swr: [rasterizer fetch] add support for 24bit format fetch
[mesa.git] / src / gallium / drivers / swr / rasterizer / jitter / fetch_jit.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_api.h"
31 #include "fetch_jit.h"
32 #include "builder.h"
33 #include "state_llvm.h"
34 #include <sstream>
35 #include <tuple>
36
37 //#define FETCH_DUMP_VERTEX 1
38
39 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
40
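// post-gather conversion applied to fetched components:
//   CONVERT_NONE       - store the gathered bits as-is
//   CONVERT_NORMALIZED - UNORM/SNORM: convert to float and scale into [0,1] / [-1,1]
//   CONVERT_USCALED    - unsigned int converted to float, no scaling
//   CONVERT_SSCALED    - signed int converted to float, no scaling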
41 enum ConversionType
42 {
43 CONVERT_NONE,
44 CONVERT_NORMALIZED,
45 CONVERT_USCALED,
46 CONVERT_SSCALED,
47 };
48
49 //////////////////////////////////////////////////////////////////////////
50 /// Interface to Jitting a fetch shader
51 //////////////////////////////////////////////////////////////////////////
52 struct FetchJit : public Builder
53 {
54 FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
55
56 Function* Create(const FETCH_COMPILE_STATE& fetchState);
57 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
58 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
59 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
60
61 // package up Shuffle*bpcGatherd args into a tuple for convenience
62 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
63 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
64 const uint32_t(&)[4]> Shuffle8bpcArgs;
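    // tuple layout: (vGatherResult, pVtxOut, extendCastType, conversionType,
    //                currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, swizzle)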
65 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
66
67 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
68 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
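    // tuple layout: (vGatherResult[2], pVtxOut, extendCastType, conversionType,
    //                currentVertexElement, outputElt, compMask, compCtrl, vVertexElements)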
69 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
70
71 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
72
73 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
74
75 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
76 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
77
78 bool IsOddFormat(SWR_FORMAT format);
79 bool IsUniformFormat(SWR_FORMAT format);
80 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
81 void CreateGatherOddFormats(SWR_FORMAT format, Value* pBase, Value* offsets, Value* result[4]);
82 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
83
84 Value* mpFetchInfo;
85 };
86
87 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
88 {
89 static std::size_t fetchNum = 0;
90
91 std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
92 fnName << fetchNum++;
93
94 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
95 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
96
97 IRB()->SetInsertPoint(entry);
98
99 auto argitr = fetch->getArgumentList().begin();
100
101 // Fetch shader arguments
102 mpFetchInfo = &*argitr; ++argitr;
103 mpFetchInfo->setName("fetchInfo");
104 Value* pVtxOut = &*argitr;
105 pVtxOut->setName("vtxOutput");
106 // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex
107 // index 0 (just the pointer to the simdvertex structure)
108 // index 1 (which element of the simdvertex structure to offset to; in this case 0)
109 // so it doesn't matter that the indices are i32s
110 // TODO: generate this GEP with a VECTOR structure type so this makes sense
111 std::vector<Value*> vtxInputIndices(2, C(0));
112 // GEP
113 pVtxOut = GEP(pVtxOut, C(0));
114 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
115
116 // SWR_FETCH_CONTEXT::pStreams
117 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
118 streams->setName("pStreams");
119
120 // SWR_FETCH_CONTEXT::pIndices
121 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
122 indices->setName("pIndices");
123
124 // SWR_FETCH_CONTEXT::pLastIndex
125 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
126 pLastIndex->setName("pLastIndex");
127
128
129 Value* vIndices;
130 switch(fetchState.indexType)
131 {
132 case R8_UINT:
133 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
134 if(fetchState.bDisableIndexOOBCheck){
135 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
136 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
137 }
138 else{
139 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
140 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
141 }
142 break;
143 case R16_UINT:
144 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
145 if(fetchState.bDisableIndexOOBCheck){
146 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
147 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
148 }
149 else{
150 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
151 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
152 }
153 break;
154 case R32_UINT:
155 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
156 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
157 break; // incoming type is already 32bit int
158 default: SWR_ASSERT(0, "Unsupported index type"); vIndices = nullptr; break;
159 }
160
161 // store out vertex IDs
162 STORE(vIndices, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
163
164 // store out cut mask if enabled
165 if (fetchState.bEnableCutIndex)
166 {
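        // compare each lane's index against the cut (primitive restart) index;
        // matching lanes set their bit in the cut mask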
167 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
168 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
169 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
170 }
171
172 // Fetch attributes from memory and output to a simdvertex struct
173 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
174 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
175 : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
176
177 RET_VOID();
178
179 JitManager::DumpToFile(fetch, "src");
180
181 #if defined(_DEBUG)
182 verifyFunction(*fetch);
183 #endif
184
185 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
186
187 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
188 setupPasses.add(createBreakCriticalEdgesPass());
189 setupPasses.add(createCFGSimplificationPass());
190 setupPasses.add(createEarlyCSEPass());
191 setupPasses.add(createPromoteMemoryToRegisterPass());
192
193 setupPasses.run(*fetch);
194
195 JitManager::DumpToFile(fetch, "se");
196
197 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
198
199 ///@todo Haven't touched these either. Need to remove some of these and add others.
200 optPasses.add(createCFGSimplificationPass());
201 optPasses.add(createEarlyCSEPass());
202 optPasses.add(createInstructionCombiningPass());
203 optPasses.add(createInstructionSimplifierPass());
204 optPasses.add(createConstantPropagationPass());
205 optPasses.add(createSCCPPass());
206 optPasses.add(createAggressiveDCEPass());
207
208 optPasses.run(*fetch);
209 optPasses.run(*fetch);
210
211 JitManager::DumpToFile(fetch, "opt");
212
213 return fetch;
214 }
215
216 //////////////////////////////////////////////////////////////////////////
217 /// @brief Loads attributes from memory using LOADs, shuffling the
218 /// components into SOA form.
219 /// *Note* currently does not support component control,
220 /// component packing, instancing
221 /// @param fetchState - info about attributes to be fetched from memory
222 /// @param streams - value pointer to the current vertex stream
223 /// @param vIndices - vector value of indices to load
224 /// @param pVtxOut - value pointer to output simdvertex struct
225 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
226 {
227 // Zack shuffles; a variant of the Charleston.
228
229 std::vector<Value*> vectors(16);
230 std::vector<Constant*> pMask(mVWidth);
231 for(uint32_t i = 0; i < mVWidth; ++i)
232 {
233 pMask[i] = (C(i < 4 ? i : 4));
234 }
235 Constant* promoteMask = ConstantVector::get(pMask);
236 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
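    // promoteMask widens a 4-wide attribute load to the full simd width: lanes 0..3
    // pick the loaded components, lanes >= 4 pick element 4, i.e. lane 0 of the undef
    // vector uwvec (the upper lanes are don't-care)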
237
238 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
239 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
240 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
241 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
242 curInstance->setName("curInstance");
243
244 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
245 {
246 Value* elements[4] = {0};
247 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
248 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
249 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
250 uint32_t numComponents = info.numComps;
251 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
252
253 // load path doesn't support component packing
254 SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
255
256 vectors.clear();
257
258 Value *vCurIndices;
259 Value *startOffset;
260 if(ied.InstanceEnable)
261 {
262 Value* stepRate = C(ied.InstanceDataStepRate);
263
264 // prevent a div by 0 for 0 step rate
265 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
266 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
267
268 // calc the current offset into instanced data buffer
269 Value* calcInstance = UDIV(curInstance, stepRate);
270
271 // if step rate is 0, every instance gets instance 0
272 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
273
274 vCurIndices = VBROADCAST(calcInstance);
275
276 startOffset = startInstance;
277 }
278 else
279 {
280 // offset indices by baseVertex
281 vCurIndices = ADD(vIndices, vBaseVertex);
282
283 startOffset = startVertex;
284 }
285
286 // load SWR_VERTEX_BUFFER_STATE::pData
287 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
288
289 // load SWR_VERTEX_BUFFER_STATE::pitch
290 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
291 stride = Z_EXT(stride, mInt64Ty);
292
293 // load SWR_VERTEX_BUFFER_STATE::size
294 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
295 size = Z_EXT(size, mInt64Ty);
296
297 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
298
299 // Load from the stream.
300 for(uint32_t lane = 0; lane < mVWidth; ++lane)
301 {
302 // Get index
303 Value* index = VEXTRACT(vCurIndices, C(lane));
304 index = Z_EXT(index, mInt64Ty);
305
306 Value* offset = MUL(index, stride);
307 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
308 offset = ADD(offset, startVertexOffset);
309
310 if (!fetchState.bDisableIndexOOBCheck) {
311 // check for out of bound access, including partial OOB, and mask them to 0
312 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
313 Value *oob = ICMP_ULE(endOffset, size);
314 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
315 }
316
317 Value* pointer = GEP(stream, offset);
318 // We use a full-lane, but don't actually care.
319 Value* vptr = 0;
320
321 // get a pointer to a 4 component attrib in default address space
322 switch(bpc)
323 {
324 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
325 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
326 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
327 default: SWR_ASSERT(false, "Unsupported underlying bpp!");
328 }
329
330 // load 4 components of attribute
331 Value* vec = ALIGNED_LOAD(vptr, 1, false);
332
333 // Convert To FP32 internally
334 switch(info.type[0])
335 {
336 case SWR_TYPE_UNORM:
337 switch(bpc)
338 {
339 case 8:
340 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
341 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
342 break;
343 case 16:
344 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
345 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
346 break;
347 default:
348 SWR_ASSERT(false, "Unsupported underlying type!");
349 break;
350 }
351 break;
352 case SWR_TYPE_SNORM:
353 switch(bpc)
354 {
355 case 8:
356 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
357 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
358 break;
359 case 16:
360 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
361 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
362 break;
363 default:
364 SWR_ASSERT(false, "Unsupported underlying type!");
365 break;
366 }
367 break;
368 case SWR_TYPE_UINT:
369 // Zero extend UINT types to 32 bits.
370 switch(bpc)
371 {
372 case 8:
373 case 16:
374 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
375 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
376 break;
377 case 32:
378 break; // Pass through unchanged.
379 default:
380 SWR_ASSERT(false, "Unsupported underlying type!");
381 break;
382 }
383 break;
384 case SWR_TYPE_SINT:
385 // Sign extend SINT types.
386 switch(bpc)
387 {
388 case 8:
389 case 16:
390 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
391 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
392 break;
393 case 32:
394 break; // Pass through unchanged.
395 default:
396 SWR_ASSERT(false, "Unsupported underlying type!");
397 break;
398 }
399 break;
400 case SWR_TYPE_FLOAT:
401 switch(bpc)
402 {
403 case 32:
404 break; // Pass through unchanged.
405 default:
406 SWR_ASSERT(false, "Unsupported underlying type!");
407 }
408 break;
409 case SWR_TYPE_USCALED:
410 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
411 break;
412 case SWR_TYPE_SSCALED:
413 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
414 break;
415 case SWR_TYPE_UNKNOWN:
416 case SWR_TYPE_UNUSED:
417 SWR_ASSERT(false, "Unsupported type %d!", info.type[0]);
418 }
419
420 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
421 // uwvec: 4 x F32, undef value
422 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
423 vectors.push_back(wvec);
424 }
425
426 std::vector<Constant*> v01Mask(mVWidth);
427 std::vector<Constant*> v23Mask(mVWidth);
428 std::vector<Constant*> v02Mask(mVWidth);
429 std::vector<Constant*> v13Mask(mVWidth);
430
431 // Concatenate the vectors together.
432 elements[0] = VUNDEF_F();
433 elements[1] = VUNDEF_F();
434 elements[2] = VUNDEF_F();
435 elements[3] = VUNDEF_F();
436 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
437 {
438 v01Mask[4 * b + 0] = C(0 + 4 * b);
439 v01Mask[4 * b + 1] = C(1 + 4 * b);
440 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
441 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
442
443 v23Mask[4 * b + 0] = C(2 + 4 * b);
444 v23Mask[4 * b + 1] = C(3 + 4 * b);
445 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
446 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
447
448 v02Mask[4 * b + 0] = C(0 + 4 * b);
449 v02Mask[4 * b + 1] = C(2 + 4 * b);
450 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
451 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
452
453 v13Mask[4 * b + 0] = C(1 + 4 * b);
454 v13Mask[4 * b + 1] = C(3 + 4 * b);
455 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
456 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
457
458 std::vector<Constant*> iMask(mVWidth);
459 for(uint32_t i = 0; i < mVWidth; ++i)
460 {
461 if(((4 * b) <= i) && (i < (4 * (b + 1))))
462 {
463 iMask[i] = C(i % 4 + mVWidth);
464 }
465 else
466 {
467 iMask[i] = C(i);
468 }
469 }
470 Constant* insertMask = ConstantVector::get(iMask);
471 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
472 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
473 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
474 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
475 }
476
477 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
478 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
479 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
480 Value* z2w2z3w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
481 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
482 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
483 elements[2] = VSHUFFLE(z0w0z1w1, z2w2z3w3, ConstantVector::get(v02Mask));
484 elements[3] = VSHUFFLE(z0w0z1w1, z2w2z3w3, ConstantVector::get(v13Mask));
485
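// note: intentional fall-through; components missing from the format get defaults
// (0.0f for x/y/z, 1.0f for w). e.g. a 2-component format enters at case 3, zeroing
// z and falling through to set w to 1.0f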
486 switch(numComponents + 1)
487 {
488 case 1: elements[0] = VIMMED1(0.0f);
489 case 2: elements[1] = VIMMED1(0.0f);
490 case 3: elements[2] = VIMMED1(0.0f);
491 case 4: elements[3] = VIMMED1(1.0f);
492 }
493
494 for(uint32_t c = 0; c < 4; ++c)
495 {
496 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
497 STORE(elements[c], dest);
498 }
499 }
500 }
501
502 // returns true for odd formats that require special gather handling
503 bool FetchJit::IsOddFormat(SWR_FORMAT format)
504 {
505 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
506 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32)
507 {
508 return true;
509 }
510 return false;
511 }
512
513 // format is uniform if all components are the same size and type
514 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
515 {
516 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
517 uint32_t bpc0 = info.bpc[0];
518 uint32_t type0 = info.type[0];
519
520 for (uint32_t c = 1; c < info.numComps; ++c)
521 {
522 if (bpc0 != info.bpc[c] || type0 != info.type[c])
523 {
524 return false;
525 }
526 }
527 return true;
528 }
529
530 // unpacks components based on format
531 // foreach component in the pixel
532 // mask off everything but this component
533 // shift component to LSB
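// e.g. for a packed 5-6-5 format: component 0 masks bits [4:0] (shift 0),
// component 1 masks bits [10:5] (shift 5), component 2 masks bits [15:11] (shift 11);
// each unpacked value is written to result[swizzle[c]]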
534 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
535 {
536 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
537
538 uint32_t bitOffset = 0;
539 for (uint32_t c = 0; c < info.numComps; ++c)
540 {
541 uint32_t swizzledIndex = info.swizzle[c];
542 uint32_t compBits = info.bpc[c];
543 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
544 Value* comp = AND(vInput, bitmask);
545 comp = LSHR(comp, bitOffset);
546
547 result[swizzledIndex] = comp;
548 bitOffset += compBits;
549 }
550 }
551
552 // gather for odd component size formats
553 // gather a full pixel per SIMD lane, then shift/mask to move each component into
554 // its own vector
555 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pBase, Value* offsets, Value* result[4])
556 {
557 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
558
559 // only works if pixel size is <= 32bits
560 SWR_ASSERT(info.bpp <= 32);
561
562 Value* gather = VUNDEF_I();
563
564 // assign defaults
565 for (uint32_t comp = 0; comp < 4; ++comp)
566 {
567 result[comp] = VIMMED1((int)info.defaults[comp]);
568 }
569
570 // gather SIMD pixels
571 for (uint32_t e = 0; e < JM()->mVWidth; ++e)
572 {
573 Value* elemOffset = VEXTRACT(offsets, C(e));
574 Value* load = GEP(pBase, elemOffset);
575
576 // load the proper amount of data based on component size
577 switch (info.bpp)
578 {
579 case 8: load = POINTER_CAST(load, Type::getInt8PtrTy(JM()->mContext)); break;
580 case 16: load = POINTER_CAST(load, Type::getInt16PtrTy(JM()->mContext)); break;
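        // 24bit pixels are loaded as a full 32bit dword; the extra high byte never
        // lands in any component because UnpackComponents' per-component masks cover
        // only the low info.bpp bits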
581 case 24:
582 case 32: load = POINTER_CAST(load, Type::getInt32PtrTy(JM()->mContext)); break;
583 default: SWR_ASSERT(0);
584 }
585
586 // load pixel
587 Value *val = LOAD(load);
588
589 // zero extend to 32bit integer
590 val = INT_CAST(val, mInt32Ty, false);
591
592 // store in simd lane
593 gather = VINSERT(gather, val, C(e));
594 }
595
596 UnpackComponents(format, gather, result);
597
598 // cast to fp32
599 result[0] = BITCAST(result[0], mSimdFP32Ty);
600 result[1] = BITCAST(result[1], mSimdFP32Ty);
601 result[2] = BITCAST(result[2], mSimdFP32Ty);
602 result[3] = BITCAST(result[3], mSimdFP32Ty);
603 }
604
605 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
606 {
607 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
608
609 for (uint32_t c = 0; c < info.numComps; ++c)
610 {
611 uint32_t compIndex = info.swizzle[c];
612
613 // skip any conversion on UNUSED components
614 if (info.type[c] == SWR_TYPE_UNUSED)
615 {
616 continue;
617 }
618
619 if (info.isNormalized[c])
620 {
621 if (info.type[c] == SWR_TYPE_SNORM)
622 {
623 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
624
625 /// result = c * (1.0f / (2^(n-1) - 1))
626 uint32_t n = info.bpc[c];
627 uint32_t pow2 = 1 << (n - 1);
628 float scale = 1.0f / (float)(pow2 - 1);
629 Value *vScale = VIMMED1(scale);
630 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
631 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
632 texels[compIndex] = FMUL(texels[compIndex], vScale);
633 }
634 else
635 {
636 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
637
638 /// result = c * (1.0f / (2^n - 1))
639 uint32_t n = info.bpc[c];
640 uint32_t pow2 = 1 << n;
641 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
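            // e.g. for n == 24: pow2 - 1 == 0xFFFFFF, and 16777215.0f / 16777215.0f
            // divides out to exactly 1.0f, whereas multiplying by the rounded
            // reciprocal (1.0f / 16777215.0f) can land an ULP off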
642 if (n == 24)
643 {
644 float scale = (float)(pow2 - 1);
645 Value* vScale = VIMMED1(scale);
646 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
647 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
648 texels[compIndex] = FDIV(texels[compIndex], vScale);
649 }
650 else
651 {
652 float scale = 1.0f / (float)(pow2 - 1);
653 Value *vScale = VIMMED1(scale);
654 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
655 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
656 texels[compIndex] = FMUL(texels[compIndex], vScale);
657 }
658 }
659 continue;
660 }
661 }
662 }
663
664 //////////////////////////////////////////////////////////////////////////
665 /// @brief Loads attributes from memory using AVX2 GATHER(s)
666 /// @param fetchState - info about attributes to be fetched from memory
667 /// @param streams - value pointer to the current vertex stream
668 /// @param vIndices - vector value of indices to gather
669 /// @param pVtxOut - value pointer to output simdvertex struct
670 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
671 Value* streams, Value* vIndices, Value* pVtxOut)
672 {
673 uint32_t currentVertexElement = 0;
674 uint32_t outputElt = 0;
675 Value* vVertexElements[4];
676
677 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
678 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
679 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
680 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
681 curInstance->setName("curInstance");
682
683 for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
684 {
685 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
686
687 // skip element if all components are disabled
688 if (ied.ComponentPacking == ComponentEnable::NONE)
689 {
690 continue;
691 }
692
693 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
694 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
695 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
696
697 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
698
699 // VGATHER* takes an *i8 src pointer
700 Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
701
702 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
703 Value *vStride = VBROADCAST(stride);
704
705 // max vertex index that is fully in bounds
706 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
707 maxVertex = LOAD(maxVertex);
708
709 Value *vCurIndices;
710 Value *startOffset;
711 if(ied.InstanceEnable)
712 {
713 Value* stepRate = C(ied.InstanceDataStepRate);
714
715 // prevent a div by 0 for 0 step rate
716 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
717 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
718
719 // calc the current offset into instanced data buffer
720 Value* calcInstance = UDIV(curInstance, stepRate);
721
722 // if step rate is 0, every instance gets instance 0
723 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
724
725 vCurIndices = VBROADCAST(calcInstance);
726
727 startOffset = startInstance;
728 }
729 else
730 {
731 // offset indices by baseVertex
732 vCurIndices = ADD(vIndices, vBaseVertex);
733
734 startOffset = startVertex;
735 }
736
737 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
738 // do 64bit address offset calculations.
739
740 // calculate byte offset to the start of the VB
741 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
742 pStreamBase = GEP(pStreamBase, baseOffset);
743
744 // if we have a start offset, subtract from max vertex. Used for OOB check
745 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
746 Value* neg = ICMP_SLT(maxVertex, C((int64_t)0));
747 // if we have a negative value, we're already OOB. clamp at 0.
748 maxVertex = SELECT(neg, C(0), TRUNC(maxVertex, mInt32Ty));
749
750 // Load the in bounds size of a partially valid vertex
751 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
752 partialInboundsSize = LOAD(partialInboundsSize);
753 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
754 Value* vBpp = VBROADCAST(C(info.Bpp));
755 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
756
757 // is the element <= the partially valid size?
758 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
759
760 // override cur indices with 0 if pitch is 0
761 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
762 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
763
764 // are vertices partially OOB?
765 Value* vMaxVertex = VBROADCAST(maxVertex);
766 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
767
768 // are vertices fully in bounds?
769 Value* vGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
770
771 // blend in any partially OOB indices that have valid elements
772 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
773 vGatherMask = VMASK(vGatherMask);
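        // vGatherMask now has all bits set for lanes that are safe to gather: either
        // fully in bounds, or partially OOB with this element still inside the
        // partially valid region. Masked-off lanes keep the gather source value.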
774
775 // calculate the actual offsets into the VB
776 Value* vOffsets = MUL(vCurIndices, vStride);
777 vOffsets = ADD(vOffsets, vAlignmentOffsets);
778
779 // Packing and component control
780 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
781 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
782 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
783
784 // Special gather/conversion for formats without equal component sizes
785 if (IsOddFormat((SWR_FORMAT)ied.Format))
786 {
787 Value* pResults[4];
788 CreateGatherOddFormats((SWR_FORMAT)ied.Format, pStreamBase, vOffsets, pResults);
789 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
790
791 for (uint32_t c = 0; c < 4; ++c)
792 {
793 if (isComponentEnabled(compMask, c))
794 {
795 vVertexElements[currentVertexElement++] = pResults[c];
796 if (currentVertexElement > 3)
797 {
798 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
799 // reset to the next vVertexElement to output
800 currentVertexElement = 0;
801 }
802 }
803 }
804 }
805 else if(info.type[0] == SWR_TYPE_FLOAT)
806 {
807 ///@todo: support 64 bit vb accesses
808 Value* gatherSrc = VIMMED1(0.0f);
809
810 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
811 "Unsupported format for standard gather fetch.");
812
813 // Gather components from memory to store in a simdvertex structure
814 switch(bpc)
815 {
816 case 16:
817 {
818 Value* vGatherResult[2];
819 Value *vMask;
820
821 // if we have at least one component out of x or y to fetch
822 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
823 // save mask as it is zero'd out after each gather
824 vMask = vGatherMask;
825
826 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
827 // e.g. result of first 8x32bit integer gather for 16bit components
828 // 256i - 0 1 2 3 4 5 6 7
829 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
830 //
831 }
832
833 // if we have at least one component out of z or w to fetch
834 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
835 // offset base to the next components(zw) in the vertex to gather
836 pStreamBase = GEP(pStreamBase, C((char)4));
837 vMask = vGatherMask;
838
839 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
840 // e.g. result of second 8x32bit integer gather for 16bit components
841 // 256i - 0 1 2 3 4 5 6 7
842 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
843 //
844 }
845
846 // if we have at least one component to shuffle into place
847 if(compMask){
848 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
849 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
850
851 // Shuffle gathered components into place in simdvertex struct
852 Shuffle16bpcGather(args); // outputs to vVertexElements ref
853 }
854 }
855 break;
856 case 32:
857 {
858 for (uint32_t i = 0; i < 4; i++)
859 {
860 if (isComponentEnabled(compMask, i))
861 {
862 // if we need to gather the component
863 if (compCtrl[i] == StoreSrc)
864 {
865 // save mask as it is zero'd out after each gather
866 Value *vMask = vGatherMask;
867
868 // Gather a SIMD of vertices
869 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
870 }
871 else
872 {
873 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
874 }
875
876 if (currentVertexElement > 3)
877 {
878 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
879 // reset to the next vVertexElement to output
880 currentVertexElement = 0;
881 }
882
883 }
884
885 // offset base to the next component in the vertex to gather
886 pStreamBase = GEP(pStreamBase, C((char)4));
887 }
888 }
889 break;
890 default:
891 SWR_ASSERT(0, "Tried to fetch invalid FP format");
892 break;
893 }
894 }
895 else
896 {
897 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
898 ConversionType conversionType = CONVERT_NONE;
899
900 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
901 "Unsupported format for standard gather fetch.");
902
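            // note: UNORM and SNORM intentionally fall through to the UINT/SINT cases
            // below so they also pick up the matching zero/sign extend cast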
903 switch(info.type[0])
904 {
905 case SWR_TYPE_UNORM:
906 conversionType = CONVERT_NORMALIZED;
907 case SWR_TYPE_UINT:
908 extendCastType = Instruction::CastOps::ZExt;
909 break;
910 case SWR_TYPE_SNORM:
911 conversionType = CONVERT_NORMALIZED;
912 case SWR_TYPE_SINT:
913 extendCastType = Instruction::CastOps::SExt;
914 break;
915 case SWR_TYPE_USCALED:
916 conversionType = CONVERT_USCALED;
917 extendCastType = Instruction::CastOps::UIToFP;
918 break;
919 case SWR_TYPE_SSCALED:
920 conversionType = CONVERT_SSCALED;
921 extendCastType = Instruction::CastOps::SIToFP;
922 break;
923 default:
924 break;
925 }
926
927 // value substituted when component of gather is masked
928 Value* gatherSrc = VIMMED1(0);
929
930 // Gather components from memory to store in a simdvertex structure
931 switch (bpc)
932 {
933 case 8:
934 {
935 // if we have at least one component to fetch
936 if(compMask)
937 {
938 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
939 // e.g. result of an 8x32bit integer gather for 8bit components
940 // 256i - 0 1 2 3 4 5 6 7
941 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
942
943 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
944 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
945
946 // Shuffle gathered components into place in simdvertex struct
947 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
948 }
949 }
950 break;
951 case 16:
952 {
953 Value* vGatherResult[2];
954 Value *vMask;
955
956 // if we have at least one component out of x or y to fetch
957 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
958 // save mask as it is zero'd out after each gather
959 vMask = vGatherMask;
960
961 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
962 // e.g. result of first 8x32bit integer gather for 16bit components
963 // 256i - 0 1 2 3 4 5 6 7
964 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
965 //
966 }
967
968 // if we have at least one component out of z or w to fetch
969 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
970 // offset base to the next components(zw) in the vertex to gather
971 pStreamBase = GEP(pStreamBase, C((char)4));
972 vMask = vGatherMask;
973
974 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
975 // e.g. result of second 8x32bit integer gather for 16bit components
976 // 256i - 0 1 2 3 4 5 6 7
977 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
978 //
979 }
980
981 // if we have at least one component to shuffle into place
982 if(compMask){
983 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
984 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
985
986 // Shuffle gathered components into place in simdvertex struct
987 Shuffle16bpcGather(args); // outputs to vVertexElements ref
988 }
989 }
990 break;
991 case 32:
992 {
993 // Gathered components into place in simdvertex struct
994 for (uint32_t i = 0; i < 4; i++)
995 {
996 if (isComponentEnabled(compMask, i))
997 {
998 // if we need to gather the component
999 if (compCtrl[i] == StoreSrc)
1000 {
1001 // save mask as it is zero'd out after each gather
1002 Value *vMask = vGatherMask;
1003
1004 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
1005
1006 if (conversionType == CONVERT_USCALED)
1007 {
1008 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1009 }
1010 else if (conversionType == CONVERT_SSCALED)
1011 {
1012 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1013 }
1014
1015 vVertexElements[currentVertexElement++] = pGather;
1016 // e.g. result of a single 8x32bit integer gather for 32bit components
1017 // 256i - 0 1 2 3 4 5 6 7
1018 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1019 }
1020 else
1021 {
1022 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1023 }
1024
1025 if (currentVertexElement > 3)
1026 {
1027 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1028 // reset to the next vVertexElement to output
1029 currentVertexElement = 0;
1030 }
1031
1032 }
1033
1034 // offset base to the next component in the vertex to gather
1035 pStreamBase = GEP(pStreamBase, C((char)4));
1036 }
1037 }
1038 break;
1039 }
1040 }
1041 }
1042
1043 // if we have a partially filled vVertexElement struct, output it
1044 if(currentVertexElement > 0){
1045 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1046 }
1047 }
1048
1049 //////////////////////////////////////////////////////////////////////////
1050 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1051 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1052 /// support
1053 /// @param pIndices - pointer to 8 bit indices
1054 /// @param pLastIndex - pointer to last valid index
1055 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1056 {
1057 // can fit 4 8 bit integers per vWidth lane
1058 Value* vIndices = VUNDEF_I();
1059
1060 // store 0 index on stack to be used to conditionally load from if index address is OOB
1061 Value* pZeroIndex = ALLOCA(mInt8Ty);
1062 STORE(C((uint8_t)0), pZeroIndex);
1063
1064 // Load a SIMD of index pointers
1065 for(int64_t lane = 0; lane < mVWidth; lane++)
1066 {
1067 // Calculate the address of the requested index
1068 Value *pIndex = GEP(pIndices, C(lane));
1069
1070 // check if the address is less than the max index,
1071 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1072
1073 // if valid, load the index. if not, load 0 from the stack
1074 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1075 Value *index = LOAD(pValid, "valid index");
1076
1077 // zero extend the index to 32 bits and insert it into the correct simd lane
1078 index = Z_EXT(index, mInt32Ty);
1079 vIndices = VINSERT(vIndices, index, lane);
1080 }
1081 return vIndices;
1082 }
1083
1084 //////////////////////////////////////////////////////////////////////////
1085 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1086 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1087 /// support
1088 /// @param pIndices - pointer to 16 bit indices
1089 /// @param pLastIndex - pointer to last valid index
1090 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1091 {
1092 // can fit 2 16 bit integers per vWidth lane
1093 Value* vIndices = VUNDEF_I();
1094
1095 // store 0 index on stack to be used to conditionally load from if index address is OOB
1096 Value* pZeroIndex = ALLOCA(mInt16Ty);
1097 STORE(C((uint16_t)0), pZeroIndex);
1098
1099 // Load a SIMD of index pointers
1100 for(int64_t lane = 0; lane < mVWidth; lane++)
1101 {
1102 // Calculate the address of the requested index
1103 Value *pIndex = GEP(pIndices, C(lane));
1104
1105 // check if the address is less than the max index,
1106 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1107
1108 // if valid, load the index. if not, load 0 from the stack
1109 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1110 Value *index = LOAD(pValid, "valid index");
1111
1112 // zero extend the index to 32 bits and insert it into the correct simd lane
1113 index = Z_EXT(index, mInt32Ty);
1114 vIndices = VINSERT(vIndices, index, lane);
1115 }
1116 return vIndices;
1117 }
1118
1119 //////////////////////////////////////////////////////////////////////////
1120 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1121 /// @param pIndices - pointer to 32 bit indices
1122 /// @param pLastIndex - pointer to last valid index
1123 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1124 {
1125 DataLayout dL(JM()->mpCurrentModule);
1126 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1127 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1128 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1129
1130 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1131 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1132 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1133 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1134
1135 // create a vector of index counts from the base index ptr passed into the fetch
1136 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1137 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1138
1139 // compare index count to the max valid index
1140 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1141 // vIndexOffsets 0 1 2 3 4 5 6 7
1142 // ------------------------------
1143 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1144 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1145 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1146 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1147
1148 // VMASKLOAD takes an *i8 src pointer
1149 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1150
1151 // Load the indices; OOB loads 0
1152 return MASKLOADD(pIndices,vIndexMask);
1153 }
1154
1155 //////////////////////////////////////////////////////////////////////////
1156 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1157 /// denormalizes if needed, converts to F32 if needed, and positions in
1158 /// the proper SIMD rows to be output to the simdvertex structure
1159 /// @param args: (tuple of args, listed below)
1160 /// @param vGatherResult - 8 gathered 8bpc vertices
1161 /// @param pVtxOut - base pointer to output simdvertex struct
1162 /// @param extendType - sign extend or zero extend
1163 /// @param conversionType - normalization/scaling conversion to apply, if any
1164 /// @param currentVertexElement - reference to the current vVertexElement
1165 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1166 /// @param compMask - component packing mask
1167 /// @param compCtrl - component control val
1168 /// @param vVertexElements[4] - vertex components to output
1169 /// @param swizzle[4] - component swizzle location
1170 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1171 {
1172 // Unpack tuple args
1173 Value*& vGatherResult = std::get<0>(args);
1174 Value* pVtxOut = std::get<1>(args);
1175 const Instruction::CastOps extendType = std::get<2>(args);
1176 const ConversionType conversionType = std::get<3>(args);
1177 uint32_t &currentVertexElement = std::get<4>(args);
1178 uint32_t &outputElt = std::get<5>(args);
1179 const ComponentEnable compMask = std::get<6>(args);
1180 const ComponentControl (&compCtrl)[4] = std::get<7>(args);
1181 Value* (&vVertexElements)[4] = std::get<8>(args);
1182 const uint32_t (&swizzle)[4] = std::get<9>(args);
1183
1184 // cast types
1185 Type* vGatherTy = mSimdInt32Ty;
1186 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1187
1188 // have to do extra work for sign extending
1189 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
1190 Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
1191 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1192
1193 // shuffle mask, including any swizzling
1194 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1195 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1196 Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
1197 char(y), char(y+4), char(y+8), char(y+12),
1198 char(z), char(z+4), char(z+8), char(z+12),
1199 char(w), char(w+4), char(w+8), char(w+12),
1200 char(x), char(x+4), char(x+8), char(x+12),
1201 char(y), char(y+4), char(y+8), char(y+12),
1202 char(z), char(z+4), char(z+8), char(z+12),
1203 char(w), char(w+4), char(w+8), char(w+12)});
1204
1205 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1206 // after pshufb: group components together in each 128bit lane
1207 // 256i - 0 1 2 3 4 5 6 7
1208 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1209
1210 Value* vi128XY = nullptr;
1211 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1212 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1213 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1214 // 256i - 0 1 2 3 4 5 6 7
1215 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1216 }
1217
1218 // do the same for zw components
1219 Value* vi128ZW = nullptr;
1220 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1221 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1222 }
1223
1224 // init denormalize variables if needed
1225 Instruction::CastOps fpCast;
1226 Value* conversionFactor;
1227
1228 switch (conversionType)
1229 {
1230 case CONVERT_NORMALIZED:
1231 fpCast = Instruction::CastOps::SIToFP;
1232 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1233 break;
1234 case CONVERT_SSCALED:
1235 fpCast = Instruction::CastOps::SIToFP;
1236 conversionFactor = VIMMED1((float)(1.0));
1237 break;
1238 case CONVERT_USCALED:
1239 SWR_ASSERT(0, "Type should not be sign extended!");
1240 conversionFactor = nullptr;
1241 break;
1242 default:
1243 SWR_ASSERT(conversionType == CONVERT_NONE);
1244 conversionFactor = nullptr;
1245 break;
1246 }
1247
1248 // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
1249 for (uint32_t i = 0; i < 4; i++)
1250 {
1251 if (isComponentEnabled(compMask, i))
1252 {
1253 if (compCtrl[i] == ComponentControl::StoreSrc)
1254 {
1255 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1256 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1257 // if x or y, use vi128XY permute result, else use vi128ZW
1258 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1259
1260 // sign extend
1261 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
1262
1263 // denormalize if needed
1264 if (conversionType != CONVERT_NONE)
1265 {
1266 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1267 }
1268 currentVertexElement++;
1269 }
1270 else
1271 {
1272 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1273 }
1274
1275 if (currentVertexElement > 3)
1276 {
1277 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1278 // reset to the next vVertexElement to output
1279 currentVertexElement = 0;
1280 }
1281 }
1282 }
1283 }
1284 // else zero extend
1285 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1286 {
1287 // init denormalize variables if needed
1288 Instruction::CastOps fpCast;
1289 Value* conversionFactor;
1290
1291 switch (conversionType)
1292 {
1293 case CONVERT_NORMALIZED:
1294 fpCast = Instruction::CastOps::UIToFP;
1295 conversionFactor = VIMMED1((float)(1.0 / 255.0));
1296 break;
1297 case CONVERT_USCALED:
1298 fpCast = Instruction::CastOps::UIToFP;
1299 conversionFactor = VIMMED1((float)(1.0));
1300 break;
1301 case CONVERT_SSCALED:
1302 SWR_ASSERT(0, "Type should not be zero extended!");
1303 conversionFactor = nullptr;
1304 break;
1305 default:
1306 SWR_ASSERT(conversionType == CONVERT_NONE);
1307 conversionFactor = nullptr;
1308 break;
1309 }
1310
1311 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1312 for (uint32_t i = 0; i < 4; i++)
1313 {
1314 if (isComponentEnabled(compMask, i))
1315 {
1316 if (compCtrl[i] == ComponentControl::StoreSrc)
1317 {
1318 // pshufb masks for each component
1319 Value* vConstMask;
1320 switch (swizzle[i])
1321 {
1322 case 0:
1323 // x shuffle mask
1324 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1325 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
1326 break;
1327 case 1:
1328 // y shuffle mask
1329 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1330 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
1331 break;
1332 case 2:
1333 // z shuffle mask
1334 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1335 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
1336 break;
1337 case 3:
1338 // w shuffle mask
1339 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1340 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
1341 break;
1342 default:
1343 vConstMask = nullptr;
1344 break;
1345 }
1346
1347 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1348 // after pshufb for x channel
1349 // 256i - 0 1 2 3 4 5 6 7
1350 // x000 x000 x000 x000 x000 x000 x000 x000
1351
1352 // denormalize if needed
1353 if (conversionType != CONVERT_NONE)
1354 {
1355 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1356 }
1357 currentVertexElement++;
1358 }
1359 else
1360 {
1361 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1362 }
1363
1364 if (currentVertexElement > 3)
1365 {
1366 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1367 // reset to the next vVertexElement to output
1368 currentVertexElement = 0;
1369 }
1370 }
1371 }
1372 }
1373 else
1374 {
1375 SWR_ASSERT(0, "Unsupported conversion type");
1376 }
1377 }
1378
1379 //////////////////////////////////////////////////////////////////////////
1380 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1381 /// denormalizes if needed, converts to F32 if needed, and positions in
1382 /// the proper SIMD rows to be output to the simdvertex structure
1383 /// @param args: (tuple of args, listed below)
1384 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1385 /// @param pVtxOut - base pointer to output simdvertex struct
1386 /// @param extendType - sign extend or zero extend
1387 /// @param conversionType - normalization/scaling conversion to apply, if any
1388 /// @param currentVertexElement - reference to the current vVertexElement
1389 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1390 /// @param compMask - component packing mask
1391 /// @param compCtrl - component control val
1392 /// @param vVertexElements[4] - vertex components to output
1393 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
1394 {
1395 // Unpack tuple args
1396 Value* (&vGatherResult)[2] = std::get<0>(args);
1397 Value* pVtxOut = std::get<1>(args);
1398 const Instruction::CastOps extendType = std::get<2>(args);
1399 const ConversionType conversionType = std::get<3>(args);
1400 uint32_t &currentVertexElement = std::get<4>(args);
1401 uint32_t &outputElt = std::get<5>(args);
1402 const ComponentEnable compMask = std::get<6>(args);
1403 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1404 Value* (&vVertexElements)[4] = std::get<8>(args);
1405
1406 // cast types
1407 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1408 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1409
1410 // have to do extra work for sign extending
1411 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
1412 (extendType == Instruction::CastOps::FPExt))
1413 {
1414 // is this a partial-precision (16bit) float?
1415 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1416
1417 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1418 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1419
1420 // shuffle mask
1421 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1422 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1423 Value* vi128XY = nullptr;
1424 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1425 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1426 // after pshufb: group components together in each 128bit lane
1427 // 256i - 0 1 2 3 4 5 6 7
1428 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1429
1430 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1431 // after PERMD: move and pack xy components into each 128bit lane
1432 // 256i - 0 1 2 3 4 5 6 7
1433 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1434 }
1435
1436 // do the same for zw components
1437 Value* vi128ZW = nullptr;
1438 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1439 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1440 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1441 }
1442
1443 // init denormalize variables if needed
1444 Instruction::CastOps IntToFpCast;
1445 Value* conversionFactor;
1446
1447 switch (conversionType)
1448 {
1449 case CONVERT_NORMALIZED:
1450 IntToFpCast = Instruction::CastOps::SIToFP;
1451 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1452 break;
1453 case CONVERT_SSCALED:
1454 IntToFpCast = Instruction::CastOps::SIToFP;
1455 conversionFactor = VIMMED1((float)(1.0));
1456 break;
1457 case CONVERT_USCALED:
1458 SWR_ASSERT(0, "Type should not be sign extended!");
1459 conversionFactor = nullptr;
1460 break;
1461 default:
1462 SWR_ASSERT(conversionType == CONVERT_NONE);
1463 conversionFactor = nullptr;
1464 break;
1465 }
1466
1467 // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
1468 for (uint32_t i = 0; i < 4; i++)
1469 {
1470 if (isComponentEnabled(compMask, i))
1471 {
1472 if (compCtrl[i] == ComponentControl::StoreSrc)
1473 {
1474 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1475 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1476 // if x or y, use vi128XY permute result, else use vi128ZW
1477 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1478
1479 if (bFP) {
1480 // extract 128 bit lanes and convert each half-precision component to full float
1481 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1482 }
1483 else {
1484 // extract 128 bit lanes to sign extend each component
1485 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1486
1487 // denormalize if needed
1488 if (conversionType != CONVERT_NONE) {
1489 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1490 }
1491 }
1492 currentVertexElement++;
1493 }
1494 else
1495 {
1496 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1497 }
1498
1499 if (currentVertexElement > 3)
1500 {
1501 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1502 // reset to the next vVertexElement to output
1503 currentVertexElement = 0;
1504 }
1505 }
1506 }
1507 }
1508 // else zero extend
1509 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1510 {
1511 // pshufb masks for each component
1512 Value* vConstMask[2];
1513 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
1514 // x/z shuffle mask
1515 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1516 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1517 }
1518
1519 if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
1520 // y/w shuffle mask
1521 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1522 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1523 }
1524
1525 // init denormalize variables if needed
1526 Instruction::CastOps fpCast;
1527 Value* conversionFactor;
1528
1529 switch (conversionType)
1530 {
1531 case CONVERT_NORMALIZED:
1532 fpCast = Instruction::CastOps::UIToFP;
1533 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1534 break;
1535 case CONVERT_USCALED:
1536 fpCast = Instruction::CastOps::UIToFP;
1537 conversionFactor = VIMMED1((float)(1.0f));
1538 break;
1539 case CONVERT_SSCALED:
1540 SWR_ASSERT(0, "Type should not be zero extended!");
1541 conversionFactor = nullptr;
1542 break;
1543 default:
1544 SWR_ASSERT(conversionType == CONVERT_NONE);
1545 conversionFactor = nullptr;
1546 break;
1547 }
1548
1549 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1550 for (uint32_t i = 0; i < 4; i++)
1551 {
1552 if (isComponentEnabled(compMask, i))
1553 {
1554 if (compCtrl[i] == ComponentControl::StoreSrc)
1555 {
1556 // select correct constMask for x/z or y/w pshufb
1557 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1558 // if x or y, use vi128XY permute result, else use vi128ZW
1559 uint32_t selectedGather = (i < 2) ? 0 : 1;
1560
1561 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1562 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1563 // 256i - 0 1 2 3 4 5 6 7
1564 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1565
1566 // denormalize if needed
1567 if (conversionType != CONVERT_NONE)
1568 {
1569 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1570 }
1571 currentVertexElement++;
1572 }
1573 else
1574 {
1575 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1576 }
1577
1578 if (currentVertexElement > 3)
1579 {
1580 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1581 // reset to the next vVertexElement to output
1582 currentVertexElement = 0;
1583 }
1584 }
1585 }
1586 }
1587 else
1588 {
1589 SWR_ASSERT(0, "Unsupported conversion type");
1590 }
1591 }
1592
1593 //////////////////////////////////////////////////////////////////////////
1594 /// @brief Output a simdvertex worth of elements to the current outputElt
1595 /// @param pVtxOut - base address of VIN output struct
1596 /// @param outputElt - simdvertex offset in VIN to write to
1597 /// @param numEltsToStore - number of simdvertex rows to write out
1598 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
1599 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
1600 {
1601 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
1602
1603 for(uint32_t c = 0; c < numEltsToStore; ++c)
1604 {
1605 // STORE expects FP32 x vWidth type, just bitcast if needed
1606 if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){
1607 #if FETCH_DUMP_VERTEX
1608 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
1609 #endif
1610 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
1611 }
1612 #if FETCH_DUMP_VERTEX
1613 else
1614 {
1615 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
1616 }
1617 #endif
1618 // outputElt * 4 = offsetting by the size of a simdvertex
1619 // + c offsets to a 32bit x vWidth row within the current vertex
1620 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
1621 STORE(vVertexElements[c], dest);
1622 }
1623 }
1624
1625 //////////////////////////////////////////////////////////////////////////
1626 /// @brief Generates a constant vector of values based on the
1627 /// ComponentControl value
1628 /// @param ctrl - ComponentControl value
1629 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
1630 {
1631 switch(ctrl)
1632 {
1633 case NoStore: return VUNDEF_I();
1634 case Store0: return VIMMED1(0);
1635 case Store1Fp: return VIMMED1(1.0f);
1636 case Store1Int: return VIMMED1(1);
1637 case StoreVertexId:
1638 {
1639 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
1640 return VBROADCAST(pId);
1641 }
1642 case StoreInstanceId:
1643 {
1644 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
1645 return VBROADCAST(pId);
1646 }
1647 case StoreSrc:
1648 default: SWR_ASSERT(0, "Invalid component control"); return VUNDEF_I();
1649 }
1650 }
1651
1652 //////////////////////////////////////////////////////////////////////////
1653 /// @brief Returns the enable mask for the specified component.
1654 /// @param enableMask - enable bits
1655 /// @param component - component to check if enabled.
1656 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
1657 {
1658 switch (component)
1659 {
1660 // X
1661 case 0: return (enableMask & ComponentEnable::X);
1662 // Y
1663 case 1: return (enableMask & ComponentEnable::Y);
1664 // Z
1665 case 2: return (enableMask & ComponentEnable::Z);
1666 // W
1667 case 3: return (enableMask & ComponentEnable::W);
1668
1669 default: return false;
1670 }
1671 }
1672
1673
1674 //////////////////////////////////////////////////////////////////////////
1675 /// @brief JITs from fetch shader IR
1676 /// @param hJitMgr - JitManager handle
1677 /// @param func - LLVM function IR
1678 /// @return PFN_FETCH_FUNC - pointer to fetch code
1679 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
1680 {
1681 const llvm::Function* func = (const llvm::Function*)hFunc;
1682 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1683 PFN_FETCH_FUNC pfnFetch;
1684
1685 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
1686 // MCJIT finalizes modules the first time you JIT code from them. Once finalized, you cannot add new IR to the module
1687 pJitMgr->mIsModuleFinalized = true;
1688
1689 #if defined(KNOB_SWRC_TRACING)
1690 char fName[1024];
1691 const char *funcName = func->getName().data();
1692 sprintf(fName, "%s.bin", funcName);
1693 FILE *fd = fopen(fName, "wb");
1694 fwrite((void *)pfnFetch, 1, 2048, fd);
1695 fclose(fd);
1696 #endif
1697
1698 return pfnFetch;
1699 }
1700
1701 //////////////////////////////////////////////////////////////////////////
1702 /// @brief JIT compiles fetch shader
1703 /// @param hJitMgr - JitManager handle
1704 /// @param state - fetch state to build function from
1705 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
1706 {
1707 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1708
1709 pJitMgr->SetupNewModule();
1710
1711 FetchJit theJit(pJitMgr);
1712 HANDLE hFunc = theJit.Create(state);
1713
1714 return JitFetchFunc(hJitMgr, hFunc);
1715 }