swr: [rasterizer jitter] fix assert in AVX implementation of MASKLOADD
[mesa.git] / src / gallium / drivers / swr / rasterizer / jitter / fetch_jit.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_api.h"
31 #include "fetch_jit.h"
32 #include "builder.h"
33 #include "state_llvm.h"
34 #include "common/containers.hpp"
35 #include "llvm/IR/DataLayout.h"
36 #include <sstream>
37 #include <tuple>
38
39 //#define FETCH_DUMP_VERTEX 1
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45 CONVERT_NONE,
46 CONVERT_NORMALIZED,
47 CONVERT_USCALED,
48 CONVERT_SSCALED,
49 };
50
51 //////////////////////////////////////////////////////////////////////////
52 /// Interface to Jitting a fetch shader
53 //////////////////////////////////////////////////////////////////////////
54 struct FetchJit : public Builder
55 {
56 FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
57
58 Function* Create(const FETCH_COMPILE_STATE& fetchState);
59 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
60 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
61 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
62
63 // package up Shuffle*bpcGatherd args into a tuple for convenience
64 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
65 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
66 const uint32_t (&)[4]> Shuffle8bpcArgs;
67 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
68
69 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
70 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
71 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
72
73 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
74
75 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
76
77 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut);
78 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut);
79
80 bool IsOddFormat(SWR_FORMAT format);
81 bool IsUniformFormat(SWR_FORMAT format);
82 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
83 void CreateGatherOddFormats(SWR_FORMAT format, Value* pBase, Value* offsets, Value* result[4]);
84 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
85
86 };
87
88 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
89 {
90 static std::size_t fetchNum = 0;
91
92 std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
93 fnName << fetchNum++;
94
95 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
96 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
97
98 IRB()->SetInsertPoint(entry);
99
100 auto argitr = fetch->getArgumentList().begin();
101
102 // Fetch shader arguments
103 Value* fetchInfo = &*argitr; ++argitr;
104 fetchInfo->setName("fetchInfo");
105 Value* pVtxOut = &*argitr;
106 pVtxOut->setName("vtxOutput");
107 // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex
108 // index 0 (just the pointer to the simdvertex structure)
109 // index 1 (which element of the simdvertex structure to offset to; in this case 0)
110 // so the indices being i32's doesn't matter
111 // TODO: generate this GEP with a VECTOR structure type so this makes sense
112 std::vector<Value*> vtxInputIndices(2, C(0));
113 // GEP
114 pVtxOut = GEP(pVtxOut, C(0));
115 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
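// reinterpret the output as rows of <mVWidth x float>; each attribute component is stored as one SIMD row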
116
117 // SWR_FETCH_CONTEXT::pStreams
118 Value* streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
119 streams->setName("pStreams");
120
121 // SWR_FETCH_CONTEXT::pIndices
122 Value* indices = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
123 indices->setName("pIndices");
124
125 // SWR_FETCH_CONTEXT::pLastIndex
126 Value* pLastIndex = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
127 pLastIndex->setName("pLastIndex");
128
129
130 Value* vIndices;
131 switch(fetchState.indexType)
132 {
133 case R8_UINT:
134 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
135 if(fetchState.bDisableIndexOOBCheck){
136 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
137 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
138 }
139 else{
140 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
141 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
142 }
143 break;
144 case R16_UINT:
145 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
146 if(fetchState.bDisableIndexOOBCheck){
147 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
148 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
149 }
150 else{
151 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
152 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
153 }
154 break;
155 case R32_UINT:
156 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
157 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
158 break; // incoming type is already 32bit int
159 default: SWR_ASSERT(0, "Unsupported index type"); vIndices = nullptr; break;
160 }
161
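// at this point vIndices holds mVWidth 32-bit indices, regardless of the source index format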
162 // store out vertex IDs
163 STORE(vIndices, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
164
165 // store out cut mask if enabled
166 if (fetchState.bEnableCutIndex)
167 {
168 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
169 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
170 STORE(cutMask, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
171 }
172
173 // Fetch attributes from memory and output to a simdvertex struct
174 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
175 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut)
176 : JitGatherVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut);
177
178 RET_VOID();
179
180 JitManager::DumpToFile(fetch, "src");
181
182 verifyFunction(*fetch);
183
184 #if HAVE_LLVM == 0x306
185 FunctionPassManager
186 #else
187 llvm::legacy::FunctionPassManager
188 #endif
189 setupPasses(JM()->mpCurrentModule);
190
191 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
192 setupPasses.add(createBreakCriticalEdgesPass());
193 setupPasses.add(createCFGSimplificationPass());
194 setupPasses.add(createEarlyCSEPass());
195 setupPasses.add(createPromoteMemoryToRegisterPass());
196
197 setupPasses.run(*fetch);
198
199 JitManager::DumpToFile(fetch, "se");
200
201 #if HAVE_LLVM == 0x306
202 FunctionPassManager
203 #else
204 llvm::legacy::FunctionPassManager
205 #endif
206 optPasses(JM()->mpCurrentModule);
207
208 ///@todo Haven't touched these either. Need to remove some of these and add others.
209 optPasses.add(createCFGSimplificationPass());
210 optPasses.add(createEarlyCSEPass());
211 optPasses.add(createInstructionCombiningPass());
212 optPasses.add(createInstructionSimplifierPass());
213 optPasses.add(createConstantPropagationPass());
214 optPasses.add(createSCCPPass());
215 optPasses.add(createAggressiveDCEPass());
216
217 optPasses.run(*fetch);
218 optPasses.run(*fetch);
219
220 JitManager::DumpToFile(fetch, "opt");
221
222 return fetch;
223 }
224
225 //////////////////////////////////////////////////////////////////////////
226 /// @brief Loads attributes from memory using LOADs, shuffling the
227 /// components into SOA form.
228 /// *Note* currently does not support component control,
229 /// component packing, or instancing
230 /// @param fetchState - info about attributes to be fetched from memory
231 /// @param streams - value pointer to the current vertex stream
232 /// @param vIndices - vector value of indices to load
233 /// @param pVtxOut - value pointer to output simdvertex struct
234 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut)
235 {
236 // Zack shuffles; a variant of the Charleston.
237
238 SWRL::UncheckedFixedVector<Value*, 16> vectors;
239
240 std::vector<Constant*> pMask(mVWidth);
241 for(uint32_t i = 0; i < mVWidth; ++i)
242 {
243 pMask[i] = (C(i < 4 ? i : 4));
244 }
245 Constant* promoteMask = ConstantVector::get(pMask);
246 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
247
248 Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
249 Value* startInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
250 Value* curInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
251 Value* vBaseVertex = VBROADCAST(LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
252 curInstance->setName("curInstance");
253
254 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
255 {
256 Value* elements[4] = {0};
257 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
258 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
259 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
260 uint32_t numComponents = info.numComps;
261 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
262
263 vectors.clear();
264
265 Value *vCurIndices;
266 Value *startOffset;
267 if(ied.InstanceEnable)
268 {
269 Value* stepRate = C(ied.InstanceDataStepRate);
270
271 // prevent a div by 0 for 0 step rate
272 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
273 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
274
275 // calc the current offset into instanced data buffer
276 Value* calcInstance = UDIV(curInstance, stepRate);
277
278 // if step rate is 0, every instance gets instance 0
279 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
280
281 vCurIndices = VBROADCAST(calcInstance);
282
283 startOffset = startInstance;
284 }
285 else
286 {
287 // offset indices by baseVertex
288 vCurIndices = ADD(vIndices, vBaseVertex);
289
290 startOffset = startVertex;
291 }
292
293 // load SWR_VERTEX_BUFFER_STATE::pData
294 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
295
296 // load SWR_VERTEX_BUFFER_STATE::pitch
297 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
298 stride = Z_EXT(stride, mInt64Ty);
299
300 // load SWR_VERTEX_BUFFER_STATE::size
301 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
302 size = Z_EXT(size, mInt64Ty);
303
304 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
305
306 // Load from the stream.
307 for(uint32_t lane = 0; lane < mVWidth; ++lane)
308 {
309 // Get index
310 Value* index = VEXTRACT(vCurIndices, C(lane));
311 index = Z_EXT(index, mInt64Ty);
312
313 Value* offset = MUL(index, stride);
314 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
315 offset = ADD(offset, startVertexOffset);
316
317 if (!fetchState.bDisableIndexOOBCheck) {
318 // check for out of bound access, including partial OOB, and mask them to 0
319 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
320 Value *oob = ICMP_ULE(endOffset, size);
321 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
322 }
323
324 Value* pointer = GEP(stream, offset);
325 // We use a full-lane, but don't actually care.
326 Value* vptr = 0;
327
328 // get a pointer to a 4 component attrib in default address space
329 switch(bpc)
330 {
331 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
332 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
333 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
334 default: SWR_ASSERT(false, "Unsupported underlying bpp!");
335 }
336
337 // load 4 components of attribute
338 Value* vec = ALIGNED_LOAD(vptr, 1, false);
339
340 // Convert To FP32 internally
341 switch(info.type[0])
342 {
343 case SWR_TYPE_UNORM:
344 switch(bpc)
345 {
346 case 8:
347 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
348 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
349 break;
350 case 16:
351 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
352 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
353 break;
354 default:
355 SWR_ASSERT(false, "Unsupported underlying type!");
356 break;
357 }
358 break;
359 case SWR_TYPE_SNORM:
360 switch(bpc)
361 {
362 case 8:
363 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
364 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
365 break;
366 case 16:
367 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
368 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
369 break;
370 default:
371 SWR_ASSERT(false, "Unsupported underlying type!");
372 break;
373 }
374 break;
375 case SWR_TYPE_UINT:
376 // Zero extend UINT types to 32 bits.
377 switch(bpc)
378 {
379 case 8:
380 case 16:
381 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
382 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
383 break;
384 case 32:
385 break; // Pass through unchanged.
386 default:
387 SWR_ASSERT(false, "Unsupported underlying type!");
388 break;
389 }
390 break;
391 case SWR_TYPE_SINT:
392 // Sign extend SINT types.
393 switch(bpc)
394 {
395 case 8:
396 case 16:
397 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
398 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
399 break;
400 case 32:
401 break; // Pass through unchanged.
402 default:
403 SWR_ASSERT(false, "Unsupported underlying type!");
404 break;
405 }
406 break;
407 case SWR_TYPE_FLOAT:
408 switch(bpc)
409 {
410 case 32:
411 break; // Pass through unchanged.
412 default:
413 SWR_ASSERT(false, "Unsupported underlying type!");
414 }
415 break;
416 case SWR_TYPE_USCALED:
417 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
418 break;
419 case SWR_TYPE_SSCALED:
420 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
421 break;
422 case SWR_TYPE_UNKNOWN:
423 case SWR_TYPE_UNUSED:
424 SWR_ASSERT(false, "Unsupported type %d!", info.type[0]);
425 }
426
427 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
428 // uwvec: 4 x F32, undef value
429 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
430 vectors.push_back(wvec);
431 }
432
433 std::vector<Constant*> v01Mask(mVWidth);
434 std::vector<Constant*> v23Mask(mVWidth);
435 std::vector<Constant*> v02Mask(mVWidth);
436 std::vector<Constant*> v13Mask(mVWidth);
437
438 // Concatenate the vectors together.
439 elements[0] = VUNDEF_F();
440 elements[1] = VUNDEF_F();
441 elements[2] = VUNDEF_F();
442 elements[3] = VUNDEF_F();
443 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
444 {
445 v01Mask[4 * b + 0] = C(0 + 4 * b);
446 v01Mask[4 * b + 1] = C(1 + 4 * b);
447 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
448 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
449
450 v23Mask[4 * b + 0] = C(2 + 4 * b);
451 v23Mask[4 * b + 1] = C(3 + 4 * b);
452 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
453 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
454
455 v02Mask[4 * b + 0] = C(0 + 4 * b);
456 v02Mask[4 * b + 1] = C(2 + 4 * b);
457 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
458 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
459
460 v13Mask[4 * b + 0] = C(1 + 4 * b);
461 v13Mask[4 * b + 1] = C(3 + 4 * b);
462 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
463 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
464
465 std::vector<Constant*> iMask(mVWidth);
466 for(uint32_t i = 0; i < mVWidth; ++i)
467 {
468 if(((4 * b) <= i) && (i < (4 * (b + 1))))
469 {
470 iMask[i] = C(i % 4 + mVWidth);
471 }
472 else
473 {
474 iMask[i] = C(i);
475 }
476 }
477 Constant* insertMask = ConstantVector::get(iMask);
478 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
479 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
480 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
481 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
482 }
483
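// 4x4-style transpose: interleave the packed per-lane xyzw data so each elements[] row ends up holding a single component (SOA form) across all SIMD lanes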
484 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
485 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
486 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
487 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
488 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
489 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
490 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
491 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
492
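// intentional fallthrough: default any components missing from the format (0.0f for unused x/y/z, 1.0f for w)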
493 switch(numComponents + 1)
494 {
495 case 1: elements[0] = VIMMED1(0.0f);
496 case 2: elements[1] = VIMMED1(0.0f);
497 case 3: elements[2] = VIMMED1(0.0f);
498 case 4: elements[3] = VIMMED1(1.0f);
499 }
500
501 for(uint32_t c = 0; c < 4; ++c)
502 {
503 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
504 STORE(elements[c], dest);
505 }
506 }
507 }
508
509 // returns true for odd formats that require special gather handling
510 bool FetchJit::IsOddFormat(SWR_FORMAT format)
511 {
512 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
513 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32)
514 {
515 return true;
516 }
517 return false;
518 }
519
520 // format is uniform if all components are the same size and type
521 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
522 {
523 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
524 uint32_t bpc0 = info.bpc[0];
525 uint32_t type0 = info.type[0];
526
527 for (uint32_t c = 1; c < info.numComps; ++c)
528 {
529 if (bpc0 != info.bpc[c] || type0 != info.type[c])
530 {
531 return false;
532 }
533 }
534 return true;
535 }
536
537 // unpacks components based on format
538 // foreach component in the pixel
539 // mask off everything but this component
540 // shift component to LSB
541 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
542 {
543 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
544
545 uint32_t bitOffset = 0;
546 for (uint32_t c = 0; c < info.numComps; ++c)
547 {
548 uint32_t swizzledIndex = info.swizzle[c];
549 uint32_t compBits = info.bpc[c];
550 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
551 Value* comp = AND(vInput, bitmask);
552 comp = LSHR(comp, bitOffset);
553
554 result[swizzledIndex] = comp;
555 bitOffset += compBits;
556 }
557 }
558
559 // gather for odd component size formats
560 // gather a full pixel per SIMD lane, then shift/mask to move each component into its
561 // own vector
562 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pBase, Value* offsets, Value* result[4])
563 {
564 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
565
566 // only works if pixel size is <= 32bits
567 SWR_ASSERT(info.bpp <= 32);
568
569 Value* gather = VUNDEF_I();
570
571 // assign defaults
572 for (uint32_t comp = 0; comp < 4; ++comp)
573 {
574 result[comp] = VIMMED1((int)info.defaults[comp]);
575 }
576
577 // gather SIMD pixels
578 for (uint32_t e = 0; e < JM()->mVWidth; ++e)
579 {
580 Value* elemOffset = VEXTRACT(offsets, C(e));
581 Value* load = GEP(pBase, elemOffset);
582
583 // load the proper amount of data based on component size
584 switch (info.bpp)
585 {
586 case 8: load = POINTER_CAST(load, Type::getInt8PtrTy(JM()->mContext)); break;
587 case 16: load = POINTER_CAST(load, Type::getInt16PtrTy(JM()->mContext)); break;
588 case 32: load = POINTER_CAST(load, Type::getInt32PtrTy(JM()->mContext)); break;
589 default: SWR_ASSERT(0);
590 }
591
592 // load pixel
593 Value *val = LOAD(load);
594
595 // zero extend to 32bit integer
596 val = INT_CAST(val, mInt32Ty, false);
597
598 // store in simd lane
599 gather = VINSERT(gather, val, C(e));
600 }
601
602 UnpackComponents(format, gather, result);
603
604 // cast to fp32
605 result[0] = BITCAST(result[0], mSimdFP32Ty);
606 result[1] = BITCAST(result[1], mSimdFP32Ty);
607 result[2] = BITCAST(result[2], mSimdFP32Ty);
608 result[3] = BITCAST(result[3], mSimdFP32Ty);
609 }
610
611 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
612 {
613 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
614
615 for (uint32_t c = 0; c < info.numComps; ++c)
616 {
617 uint32_t compIndex = info.swizzle[c];
618
619 // skip any conversion on UNUSED components
620 if (info.type[c] == SWR_TYPE_UNUSED)
621 {
622 continue;
623 }
624
625 if (info.isNormalized[c])
626 {
627 if (info.type[c] == SWR_TYPE_SNORM)
628 {
629 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
630
631 /// result = c * (1.0f / (2^(n-1) - 1))
632 uint32_t n = info.bpc[c];
633 uint32_t pow2 = 1 << (n - 1);
634 float scale = 1.0f / (float)(pow2 - 1);
635 Value *vScale = VIMMED1(scale);
636 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
637 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
638 texels[compIndex] = FMUL(texels[compIndex], vScale);
639 }
640 else
641 {
642 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
643
644 /// result = c * (1.0f / (2^n - 1))
645 uint32_t n = info.bpc[c];
646 uint32_t pow2 = 1 << n;
647 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
648 if (n == 24)
649 {
650 float scale = (float)(pow2 - 1);
651 Value* vScale = VIMMED1(scale);
652 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
653 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
654 texels[compIndex] = FDIV(texels[compIndex], vScale);
655 }
656 else
657 {
658 float scale = 1.0f / (float)(pow2 - 1);
659 Value *vScale = VIMMED1(scale);
660 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
661 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
662 texels[compIndex] = FMUL(texels[compIndex], vScale);
663 }
664 }
665 continue;
666 }
667 }
668 }
669
670 //////////////////////////////////////////////////////////////////////////
671 /// @brief Loads attributes from memory using AVX2 GATHER(s)
672 /// @param fetchState - info about attributes to be fetched from memory
673 /// @param fetchInfo - first argument passed to fetch shader
674 /// @param streams - value pointer to the current vertex stream
675 /// @param vIndices - vector value of indices to gather
676 /// @param pVtxOut - value pointer to output simdvertex struct
677 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo,
678 Value* streams, Value* vIndices, Value* pVtxOut)
679 {
680 uint32_t currentVertexElement = 0;
681 uint32_t outputElt = 0;
682 Value* vVertexElements[4];
683
684 Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
685 Value* startInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
686 Value* curInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
687 Value* vBaseVertex = VBROADCAST(LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
688 curInstance->setName("curInstance");
689
690 for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
691 {
692 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
693 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
694 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
695 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
696
697 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
698
699 // VGATHER* takes an *i8 src pointer
700 Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
701
702 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
703 Value *vStride = VBROADCAST(stride);
704
705 // max vertex index that is fully in bounds
706 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
707 maxVertex = LOAD(maxVertex);
708
709 Value *vCurIndices;
710 Value *startOffset;
711 if(ied.InstanceEnable)
712 {
713 Value* stepRate = C(ied.InstanceDataStepRate);
714
715 // prevent a div by 0 for 0 step rate
716 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
717 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
718
719 // calc the current offset into instanced data buffer
720 Value* calcInstance = UDIV(curInstance, stepRate);
721
722 // if step rate is 0, every instance gets instance 0
723 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
724
725 vCurIndices = VBROADCAST(calcInstance);
726
727 startOffset = startInstance;
728 }
729 else
730 {
731 // offset indices by baseVertex
732 vCurIndices = ADD(vIndices, vBaseVertex);
733
734 startOffset = startVertex;
735 }
736
737 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
738 // do 64bit address offset calculations.
739
740 // calculate byte offset to the start of the VB
741 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
742 pStreamBase = GEP(pStreamBase, baseOffset);
743
744 // if we have a start offset, subtract from max vertex. Used for OOB check
745 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
746 Value* neg = ICMP_SLT(maxVertex, C((int64_t)0));
747 // if we have a negative value, we're already OOB. clamp at 0.
748 maxVertex = SELECT(neg, C(0), TRUNC(maxVertex, mInt32Ty));
749
750 // Load the in bounds size of a partially valid vertex
751 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
752 partialInboundsSize = LOAD(partialInboundsSize);
753 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
754 Value* vBpp = VBROADCAST(C(info.Bpp));
755 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
756
757 // is the element <= the partially valid size
758 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
759
760 // are vertices partially OOB?
761 Value* vMaxVertex = VBROADCAST(maxVertex);
762 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
763
764 // are the vertices fully in bounds?
765 Value* vGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
766
767 // blend in any partially OOB indices that have valid elements
768 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
769 vGatherMask = VMASK(vGatherMask);
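// widen the i1 comparison result into a full-width per-lane mask usable by the gathers below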
770
771 // calculate the actual offsets into the VB
772 Value* vOffsets = MUL(vCurIndices, vStride);
773 vOffsets = ADD(vOffsets, vAlignmentOffsets);
774
775 // Packing and component control
776 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
777 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
778 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
779
780 // Special gather/conversion for formats without equal component sizes
781 if (IsOddFormat((SWR_FORMAT)ied.Format))
782 {
783 // Only full 4 component fetch is supported for odd formats
784 SWR_ASSERT(compMask == XYZW);
785 Value* pResults[4];
786 CreateGatherOddFormats((SWR_FORMAT)ied.Format, pStreamBase, vOffsets, pResults);
787 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
788
789 StoreVertexElements(pVtxOut, outputElt++, 4, pResults);
790 currentVertexElement = 0;
791 }
792 else if(info.type[0] == SWR_TYPE_FLOAT)
793 {
794 ///@todo: support 64 bit vb accesses
795 Value* gatherSrc = VIMMED1(0.0f);
796
797 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
798 "Unsupported format for standard gather fetch.");
799
800 // Gather components from memory to store in a simdvertex structure
801 switch(bpc)
802 {
803 case 16:
804 {
805 Value* vGatherResult[2];
806 Value *vMask;
807
808 // if we have at least one component out of x or y to fetch
809 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
810 // save mask as it is zero'd out after each gather
811 vMask = vGatherMask;
812
813 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
814 // e.g. result of first 8x32bit integer gather for 16bit components
815 // 256i - 0 1 2 3 4 5 6 7
816 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
817 //
818 }
819
820 // if we have at least one component out of z or w to fetch
821 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
822 // offset base to the next components(zw) in the vertex to gather
823 pStreamBase = GEP(pStreamBase, C((char)4));
824 vMask = vGatherMask;
825
826 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
827 // e.g. result of second 8x32bit integer gather for 16bit components
828 // 256i - 0 1 2 3 4 5 6 7
829 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
830 //
831 }
832
833 // if we have at least one component to shuffle into place
834 if(compMask){
835 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
836 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
837 // Shuffle gathered components into place in simdvertex struct
838 Shuffle16bpcGather(args); // outputs to vVertexElements ref
839 }
840 }
841 break;
842 case 32:
843 {
844 for(uint32_t i = 0; i < 4; i++)
845 {
846 if(!isComponentEnabled(compMask, i)){
847 // offset base to the next component in the vertex to gather
848 pStreamBase = GEP(pStreamBase, C((char)4));
849 continue;
850 }
851
852 // if we need to gather the component
853 if(compCtrl[i] == StoreSrc){
854 // save mask as it is zero'd out after each gather
855 Value *vMask = vGatherMask;
856
857 // Gather a SIMD of vertices
858 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
859 }
860 else{
861 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
862 }
863
864 if(currentVertexElement > 3){
865 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
866 // reset to the next vVertexElement to output
867 currentVertexElement = 0;
868 }
869
870 // offset base to the next component in the vertex to gather
871 pStreamBase = GEP(pStreamBase, C((char)4));
872 }
873 }
874 break;
875 default:
876 SWR_ASSERT(0, "Tried to fetch invalid FP format");
877 break;
878 }
879 }
880 else
881 {
882 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
883 ConversionType conversionType = CONVERT_NONE;
884
885 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
886 "Unsupported format for standard gather fetch.");
887
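// intentional fallthrough: UNORM shares UINT's zero-extend cast and SNORM shares SINT's sign-extend cast; only the conversion type differs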
888 switch(info.type[0])
889 {
890 case SWR_TYPE_UNORM:
891 conversionType = CONVERT_NORMALIZED;
892 case SWR_TYPE_UINT:
893 extendCastType = Instruction::CastOps::ZExt;
894 break;
895 case SWR_TYPE_SNORM:
896 conversionType = CONVERT_NORMALIZED;
897 case SWR_TYPE_SINT:
898 extendCastType = Instruction::CastOps::SExt;
899 break;
900 case SWR_TYPE_USCALED:
901 conversionType = CONVERT_USCALED;
902 extendCastType = Instruction::CastOps::UIToFP;
903 break;
904 case SWR_TYPE_SSCALED:
905 conversionType = CONVERT_SSCALED;
906 extendCastType = Instruction::CastOps::SIToFP;
907 break;
908 default:
909 break;
910 }
911
912 // value substituted when component of gather is masked
913 Value* gatherSrc = VIMMED1(0);
914
915 // Gather components from memory to store in a simdvertex structure
916 switch (bpc)
917 {
918 case 8:
919 {
920 // if we have at least one component to fetch
921 if(compMask){
922 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
923 // e.g. result of an 8x32bit integer gather for 8bit components
924 // 256i - 0 1 2 3 4 5 6 7
925 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
926
927 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
928 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
929 // Shuffle gathered components into place in simdvertex struct
930 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
931 }
932 }
933 break;
934 case 16:
935 {
936 Value* vGatherResult[2];
937 Value *vMask;
938
939 // if we have at least one component out of x or y to fetch
940 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
941 // save mask as it is zero'd out after each gather
942 vMask = vGatherMask;
943
944 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
945 // e.g. result of first 8x32bit integer gather for 16bit components
946 // 256i - 0 1 2 3 4 5 6 7
947 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
948 //
949 }
950
951 // if we have at least one component out of z or w to fetch
952 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
953 // offset base to the next components(zw) in the vertex to gather
954 pStreamBase = GEP(pStreamBase, C((char)4));
955 vMask = vGatherMask;
956
957 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
958 // e.g. result of second 8x32bit integer gather for 16bit components
959 // 256i - 0 1 2 3 4 5 6 7
960 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
961 //
962 }
963
964 // if we have at least one component to shuffle into place
965 if(compMask){
966 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
967 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
968 // Shuffle gathered components into place in simdvertex struct
969 Shuffle16bpcGather(args); // outputs to vVertexElements ref
970 }
971 }
972 break;
973 case 32:
974 {
975 SWR_ASSERT(conversionType == CONVERT_NONE);
976
977 // Gather components and place them into the simdvertex struct
978 for(uint32_t i = 0; i < 4; i++)
979 {
980 if(!isComponentEnabled(compMask, i)){
981 // offset base to the next component in the vertex to gather
982 pStreamBase = GEP(pStreamBase, C((char)4));
983 continue;
984 }
985
986 // if we need to gather the component
987 if(compCtrl[i] == StoreSrc){
988 // save mask as it is zero'd out after each gather
989 Value *vMask = vGatherMask;
990
991 vVertexElements[currentVertexElement++] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
992
993 // e.g. result of a single 8x32bit integer gather for 32bit components
994 // 256i - 0 1 2 3 4 5 6 7
995 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
996 }
997 else{
998 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
999 }
1000
1001 if(currentVertexElement > 3){
1002 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1003 // reset to the next vVertexElement to output
1004 currentVertexElement = 0;
1005 }
1006
1007 // offset base to the next component in the vertex to gather
1008 pStreamBase = GEP(pStreamBase, C((char)4));
1009 }
1010 }
1011 break;
1012 }
1013 }
1014 }
1015
1016 // if we have a partially filled vVertexElement struct, output it
1017 if(currentVertexElement > 0){
1018 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1019 }
1020 }
1021
1022 //////////////////////////////////////////////////////////////////////////
1023 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1024 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1025 /// support
1026 /// @param pIndices - pointer to 8 bit indices
1027 /// @param pLastIndex - pointer to last valid index
1028 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1029 {
1030 // can fit 4 8bit integers per vWidth lane
1031 Value* vIndices = VUNDEF_I();
1032
1033 // store 0 index on stack to be used to conditionally load from if index address is OOB
1034 Value* pZeroIndex = ALLOCA(mInt8Ty);
1035 STORE(C((uint8_t)0), pZeroIndex);
1036
1037 // Load a SIMD of index pointers
1038 for(int64_t lane = 0; lane < mVWidth; lane++)
1039 {
1040 // Calculate the address of the requested index
1041 Value *pIndex = GEP(pIndices, C(lane));
1042
1043 // check if the address is less than the max index,
1044 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1045
1046 // if valid, load the index. if not, load 0 from the stack
1047 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1048 Value *index = LOAD(pValid, "valid index");
1049
1050 // zero extended index to 32 bits and insert into the correct simd lane
1051 index = Z_EXT(index, mInt32Ty);
1052 vIndices = VINSERT(vIndices, index, lane);
1053 }
1054 return vIndices;
1055 }
1056
1057 //////////////////////////////////////////////////////////////////////////
1058 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1059 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1060 /// support
1061 /// @param pIndices - pointer to 16 bit indices
1062 /// @param pLastIndex - pointer to last valid index
1063 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1064 {
1065 // can fit 2 16 bit integers per vWidth lane
1066 Value* vIndices = VUNDEF_I();
1067
1068 // store 0 index on stack to be used to conditionally load from if index address is OOB
1069 Value* pZeroIndex = ALLOCA(mInt16Ty);
1070 STORE(C((uint16_t)0), pZeroIndex);
1071
1072 // Load a SIMD of index pointers
1073 for(int64_t lane = 0; lane < mVWidth; lane++)
1074 {
1075 // Calculate the address of the requested index
1076 Value *pIndex = GEP(pIndices, C(lane));
1077
1078 // check if the address is less than the max index,
1079 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1080
1081 // if valid, load the index. if not, load 0 from the stack
1082 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1083 Value *index = LOAD(pValid, "valid index");
1084
1085 // zero extended index to 32 bits and insert into the correct simd lane
1086 index = Z_EXT(index, mInt32Ty);
1087 vIndices = VINSERT(vIndices, index, lane);
1088 }
1089 return vIndices;
1090 }
1091
1092 //////////////////////////////////////////////////////////////////////////
1093 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1094 /// @param pIndices - pointer to 32 bit indices
1095 /// @param pLastIndex - pointer to last valid index
1096 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1097 {
1098 DataLayout dL(JM()->mpCurrentModule);
1099 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1100 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1101 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1102
1103 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1104 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1105 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1106 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1107
1108 // create a vector of index counts from the base index ptr passed into the fetch
1109 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1110 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
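// note: the 0..7 offsets above assume an 8-wide SIMD (mVWidth == 8)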
1111
1112 // compare index count to the max valid index
1113 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1114 // vIndexOffsets 0 1 2 3 4 5 6 7
1115 // ------------------------------
1116 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1117 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1118 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1119 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1120
1121 // VMASKLOAD takes an *i8 src pointer
1122 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1123
1124 // Load the indices; OOB loads 0
1125 return MASKLOADD(pIndices,vIndexMask);
1126 }
1127
1128 //////////////////////////////////////////////////////////////////////////
1129 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1130 /// denormalizes if needed, converts to F32 if needed, and positions in
1131 /// the proper SIMD rows to be output to the simdvertex structure
1132 /// @param args: (tuple of args, listed below)
1133 /// @param vGatherResult - 8 gathered 8bpc vertices
1134 /// @param pVtxOut - base pointer to output simdvertex struct
1135 /// @param extendType - sign extend or zero extend
1136 /// @param conversionType - conversion to apply (normalized, scaled, or none)
1137 /// @param currentVertexElement - reference to the current vVertexElement
1138 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1139 /// @param compMask - component packing mask
1140 /// @param compCtrl - component control val
1141 /// @param vVertexElements[4] - vertex components to output
1142 /// @param swizzle[4] - component swizzle location
1143 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1144 {
1145 // Unpack tuple args
1146 Value*& vGatherResult = std::get<0>(args);
1147 Value* pVtxOut = std::get<1>(args);
1148 const Instruction::CastOps extendType = std::get<2>(args);
1149 const ConversionType conversionType = std::get<3>(args);
1150 uint32_t &currentVertexElement = std::get<4>(args);
1151 uint32_t &outputElt = std::get<5>(args);
1152 const ComponentEnable compMask = std::get<6>(args);
1153 const ComponentControl (&compCtrl)[4] = std::get<7>(args);
1154 Value* (&vVertexElements)[4] = std::get<8>(args);
1155 const uint32_t (&swizzle)[4] = std::get<9>(args);
1156
1157 // cast types
1158 Type* vGatherTy = mSimdInt32Ty;
1159 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1160
1161 // have to do extra work for sign extending
1162 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
1163 Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 16x8bit ints in a 128bit lane
1164 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1165
1166 // shuffle mask, including any swizzling
1167 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1168 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1169 Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
1170 char(y), char(y+4), char(y+8), char(y+12),
1171 char(z), char(z+4), char(z+8), char(z+12),
1172 char(w), char(w+4), char(w+8), char(w+12),
1173 char(x), char(x+4), char(x+8), char(x+12),
1174 char(y), char(y+4), char(y+8), char(y+12),
1175 char(z), char(z+4), char(z+8), char(z+12),
1176 char(w), char(w+4), char(w+8), char(w+12)});
1177
1178 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1179 // after pshufb: group components together in each 128bit lane
1180 // 256i - 0 1 2 3 4 5 6 7
1181 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1182
1183 Value* vi128XY = nullptr;
1184 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1185 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1186 // after PERMD: move and pack the xy components into the low 64 bits of each 128bit lane
1187 // 256i - 0 1 2 3 4 5 6 7
1188 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1189 }
1190
1191 // do the same for zw components
1192 Value* vi128ZW = nullptr;
1193 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1194 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1195 }
1196
1197 // init denormalize variables if needed
1198 Instruction::CastOps fpCast;
1199 Value* conversionFactor;
1200
1201 switch (conversionType)
1202 {
1203 case CONVERT_NORMALIZED:
1204 fpCast = Instruction::CastOps::SIToFP;
1205 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1206 break;
1207 case CONVERT_SSCALED:
1208 fpCast = Instruction::CastOps::SIToFP;
1209 conversionFactor = VIMMED1((float)(1.0));
1210 break;
1211 case CONVERT_USCALED:
1212 SWR_ASSERT(0, "Type should not be sign extended!");
1213 conversionFactor = nullptr;
1214 break;
1215 default:
1216 SWR_ASSERT(conversionType == CONVERT_NONE);
1217 conversionFactor = nullptr;
1218 break;
1219 }
1220
1221 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1222 for(uint32_t i = 0; i < 4; i++){
1223 if(!isComponentEnabled(compMask, i)){
1224 continue;
1225 }
1226
1227 if(compCtrl[i] == ComponentControl::StoreSrc){
1228 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1229 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1230 // if x or y, use vi128XY permute result, else use vi128ZW
1231 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1232
1233 // sign extend
1234 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
1235
1236 // denormalize if needed
1237 if(conversionType != CONVERT_NONE){
1238 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1239 }
1240 currentVertexElement++;
1241 }
1242 else{
1243 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1244 }
1245
1246 if(currentVertexElement > 3){
1247 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1248 // reset to the next vVertexElement to output
1249 currentVertexElement = 0;
1250 }
1251 }
1252 }
1253 // else zero extend
1254 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1255 {
1256 // init denormalize variables if needed
1257 Instruction::CastOps fpCast;
1258 Value* conversionFactor;
1259
1260 switch (conversionType)
1261 {
1262 case CONVERT_NORMALIZED:
1263 fpCast = Instruction::CastOps::UIToFP;
1264 conversionFactor = VIMMED1((float)(1.0 / 255.0));
1265 break;
1266 case CONVERT_USCALED:
1267 fpCast = Instruction::CastOps::UIToFP;
1268 conversionFactor = VIMMED1((float)(1.0));
1269 break;
1270 case CONVERT_SSCALED:
1271 SWR_ASSERT(0, "Type should not be zero extended!");
1272 conversionFactor = nullptr;
1273 break;
1274 default:
1275 SWR_ASSERT(conversionType == CONVERT_NONE);
1276 conversionFactor = nullptr;
1277 break;
1278 }
1279
1280 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1281 for(uint32_t i = 0; i < 4; i++){
1282 if(!isComponentEnabled(compMask, i)){
1283 continue;
1284 }
1285
1286 if(compCtrl[i] == ComponentControl::StoreSrc){
1287 // pshufb masks for each component
1288 Value* vConstMask;
1289 switch(swizzle[i]){
1290 case 0:
1291 // x shuffle mask
1292 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1293 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1294 break;
1295 case 1:
1296 // y shuffle mask
1297 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1298 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1299 break;
1300 case 2:
1301 // z shuffle mask
1302 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1303 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1304 break;
1305 case 3:
1306 // w shuffle mask
1307 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1308 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1309 break;
1310 default:
1311 vConstMask = nullptr;
1312 break;
1313 }
1314
1315 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1316 // after pshufb for x channel
1317 // 256i - 0 1 2 3 4 5 6 7
1318 // x000 x000 x000 x000 x000 x000 x000 x000
1319
1320 // denormalize if needed
1321 if (conversionType != CONVERT_NONE){
1322 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1323 }
1324 currentVertexElement++;
1325 }
1326 else{
1327 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1328 }
1329
1330 if(currentVertexElement > 3){
1331 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1332 // reset to the next vVertexElement to output
1333 currentVertexElement = 0;
1334 }
1335 }
1336 }
1337 else
1338 {
1339 SWR_ASSERT(0, "Unsupported conversion type");
1340 }
1341 }
1342
1343 //////////////////////////////////////////////////////////////////////////
1344 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1345 /// denormalizes if needed, converts to F32 if needed, and positions in
1346 /// the proper SIMD rows to be output to the simdvertex structure
1347 /// @param args: (tuple of args, listed below)
1348 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1349 /// @param pVtxOut - base pointer to output simdvertex struct
1350 /// @param extendType - sign extend or zero extend
1351 /// @param conversionType - conversion to apply (normalized, scaled, or none)
1352 /// @param currentVertexElement - reference to the current vVertexElement
1353 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1354 /// @param compMask - component packing mask
1355 /// @param compCtrl - component control val
1356 /// @param vVertexElements[4] - vertex components to output
1357 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
1358 {
1359 // Unpack tuple args
1360 Value* (&vGatherResult)[2] = std::get<0>(args);
1361 Value* pVtxOut = std::get<1>(args);
1362 const Instruction::CastOps extendType = std::get<2>(args);
1363 const ConversionType conversionType = std::get<3>(args);
1364 uint32_t &currentVertexElement = std::get<4>(args);
1365 uint32_t &outputElt = std::get<5>(args);
1366 const ComponentEnable compMask = std::get<6>(args);
1367 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1368 Value* (&vVertexElements)[4] = std::get<8>(args);
1369
1370 // cast types
1371 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1372 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1373
1374 // have to do extra work for sign extending
1375 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
1376 (extendType == Instruction::CastOps::FPExt))
1377 {
1378 // is this PP float?
1379 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1380
1381 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1382 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1383
1384 // shuffle mask
1385 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1386 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1387 Value* vi128XY = nullptr;
1388 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1389 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1390 // after pshufb: group components together in each 128bit lane
1391 // 256i - 0 1 2 3 4 5 6 7
1392 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1393
1394 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1395 // after PERMD: move and pack xy components into each 128bit lane
1396 // 256i - 0 1 2 3 4 5 6 7
1397 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1398 }
1399
1400 // do the same for zw components
1401 Value* vi128ZW = nullptr;
1402 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1403 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1404 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1405 }
1406
1407 // init denormalize variables if needed
1408 Instruction::CastOps IntToFpCast;
1409 Value* conversionFactor;
1410
1411 switch (conversionType)
1412 {
1413 case CONVERT_NORMALIZED:
1414 IntToFpCast = Instruction::CastOps::SIToFP;
1415 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1416 break;
1417 case CONVERT_SSCALED:
1418 IntToFpCast = Instruction::CastOps::SIToFP;
1419 conversionFactor = VIMMED1((float)(1.0));
1420 break;
1421 case CONVERT_USCALED:
1422 SWR_ASSERT(0, "Type should not be sign extended!");
1423 conversionFactor = nullptr;
1424 break;
1425 default:
1426 SWR_ASSERT(conversionType == CONVERT_NONE);
1427 conversionFactor = nullptr;
1428 break;
1429 }
1430
1431 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1432 for(uint32_t i = 0; i < 4; i++){
1433 if(!isComponentEnabled(compMask, i)){
1434 continue;
1435 }
1436
1437 if(compCtrl[i] == ComponentControl::StoreSrc){
1438 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1439 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1440 // if x or y, use vi128XY permute result, else use vi128ZW
1441 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1442
1443 if(bFP) {
1444 // extract 128 bit lanes and convert the half-float components to full 32bit float
1445 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1446 }
1447 else {
1448 // extract 128 bit lanes to sign extend each component
1449 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1450
1451 // denormalize if needed
1452 if(conversionType != CONVERT_NONE){
1453 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1454 }
1455 }
1456 currentVertexElement++;
1457 }
1458 else{
1459 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1460 }
1461
1462 if(currentVertexElement > 3){
1463 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1464 // reset to the next vVertexElement to output
1465 currentVertexElement = 0;
1466 }
1467 }
1468
1469 }
1470 // else zero extend
1471 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1472 {
1473 // pshufb masks for each component
1474 Value* vConstMask[2];
1475 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
1476 // x/z shuffle mask
1477 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1478 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1479 }
1480
1481 if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
1482 // y/w shuffle mask
1483 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1484 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1485 }
1486
1487 // init denormalize variables if needed
1488 Instruction::CastOps fpCast;
1489 Value* conversionFactor;
1490
1491 switch (conversionType)
1492 {
1493 case CONVERT_NORMALIZED:
1494 fpCast = Instruction::CastOps::UIToFP;
1495 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1496 break;
1497 case CONVERT_USCALED:
1498 fpCast = Instruction::CastOps::UIToFP;
1499 conversionFactor = VIMMED1((float)(1.0f));
1500 break;
1501 case CONVERT_SSCALED:
1502 SWR_ASSERT(0, "Type should not be zero extended!");
1503 conversionFactor = nullptr;
1504 break;
1505 default:
1506 SWR_ASSERT(conversionType == CONVERT_NONE);
1507 conversionFactor = nullptr;
1508 break;
1509 }
1510
1511 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1512 for(uint32_t i = 0; i < 4; i++){
1513 if(!isComponentEnabled(compMask, i)){
1514 continue;
1515 }
1516
1517 if(compCtrl[i] == ComponentControl::StoreSrc){
1518 // select correct constMask for x/z or y/w pshufb
1519 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1520 // if x or y, use the first (xy) gather result, else use the second (zw) gather result
1521 uint32_t selectedGather = (i < 2) ? 0 : 1;
1522
1523 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1524 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1525 // 256i - 0 1 2 3 4 5 6 7
1526 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1527
1528 // denormalize if needed
1529 if(conversionType != CONVERT_NONE){
1530 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1531 }
1532 currentVertexElement++;
1533 }
1534 else{
1535 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1536 }
1537
1538 if(currentVertexElement > 3){
1539 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1540 // reset to the next vVertexElement to output
1541 currentVertexElement = 0;
1542 }
1543 }
1544 }
1545 else
1546 {
1547 SWR_ASSERT(0, "Unsupported conversion type");
1548 }
1549 }
1550
1551 //////////////////////////////////////////////////////////////////////////
1552 /// @brief Output a simdvertex worth of elements to the current outputElt
1553 /// @param pVtxOut - base address of VIN output struct
1554 /// @param outputElt - simdvertex offset in VIN to write to
1555 /// @param numEltsToStore - number of simdvertex rows to write out
1556 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
1557 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
1558 {
1559 for(uint32_t c = 0; c < numEltsToStore; ++c)
1560 {
1561 // STORE expects FP32 x vWidth type, just bitcast if needed
1562 if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){
1563 #if FETCH_DUMP_VERTEX
1564 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
1565 #endif
1566 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
1567 }
1568 #if FETCH_DUMP_VERTEX
1569 else
1570 {
1571 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
1572 }
1573 #endif
1574 // outputElt * 4 = offsetting by the size of a simdvertex
1575 // + c offsets to a 32bit x vWidth row within the current vertex
1576 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
1577 STORE(vVertexElements[c], dest);
1578 }
1579 }
1580
1581 //////////////////////////////////////////////////////////////////////////
1582 /// @brief Generates a constant vector of values based on the
1583 /// ComponentControl value
1584 /// @param ctrl - ComponentControl value
1585 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
1586 {
1587 switch(ctrl)
1588 {
1589 case NoStore: return VUNDEF_I();
1590 case Store0: return VIMMED1(0);
1591 case Store1Fp: return VIMMED1(1.0f);
1592 case Store1Int: return VIMMED1(1);
1593 case StoreSrc:
1594 default: SWR_ASSERT(0, "Invalid component control"); return VUNDEF_I();
1595 }
1596 }
1597
1598 //////////////////////////////////////////////////////////////////////////
1599 /// @brief Returns true if the specified component is enabled in the mask.
1600 /// @param enableMask - enable bits
1601 /// @param component - component to check if enabled.
1602 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
1603 {
1604 switch (component)
1605 {
1606 // X
1607 case 0: return (enableMask & ComponentEnable::X);
1608 // Y
1609 case 1: return (enableMask & ComponentEnable::Y);
1610 // Z
1611 case 2: return (enableMask & ComponentEnable::Z);
1612 // W
1613 case 3: return (enableMask & ComponentEnable::W);
1614
1615 default: return false;
1616 }
1617 }
1618
1619
1620 //////////////////////////////////////////////////////////////////////////
1621 /// @brief JITs from fetch shader IR
1622 /// @param hJitMgr - JitManager handle
1623 /// @param func - LLVM function IR
1624 /// @return PFN_FETCH_FUNC - pointer to fetch code
1625 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
1626 {
1627 const llvm::Function* func = (const llvm::Function*)hFunc;
1628 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1629 PFN_FETCH_FUNC pfnFetch;
1630
1631 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
1632 // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
1633 pJitMgr->mIsModuleFinalized = true;
1634
1635 #if defined(KNOB_SWRC_TRACING)
1636 char fName[1024];
1637 const char *funcName = func->getName().data();
1638 sprintf(fName, "%s.bin", funcName);
1639 FILE *fd = fopen(fName, "wb");
1640 fwrite((void *)pfnFetch, 1, 2048, fd);
1641 fclose(fd);
1642 #endif
1643
1644 return pfnFetch;
1645 }
1646
1647 //////////////////////////////////////////////////////////////////////////
1648 /// @brief JIT compiles fetch shader
1649 /// @param hJitMgr - JitManager handle
1650 /// @param state - fetch state to build function from
1651 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
1652 {
1653 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1654
1655 pJitMgr->SetupNewModule();
1656
1657 FetchJit theJit(pJitMgr);
1658 HANDLE hFunc = theJit.Create(state);
1659
1660 return JitFetchFunc(hJitMgr, hFunc);
1661 }