src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file fetch_jit.cpp
  24 *
  25 * @brief Implementation of the fetch jitter
  26 *
  27 * Notes:
  28 *
  29 ******************************************************************************/
  30 #include "jit_api.h"
  31 #include "fetch_jit.h"
  32 #include "builder.h"
  33 #include "state_llvm.h"
  34 #include "common/containers.hpp"
  35 #include "llvm/IR/DataLayout.h"
  36 #include <sstream>
  37 #include <tuple>
  38
  39 //#define FETCH_DUMP_VERTEX 1
  40
  41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
  42
  43 enum ConversionType
  44 {
  45     CONVERT_NONE,
  46     CONVERT_NORMALIZED,
  47     CONVERT_USCALED,
  48     CONVERT_SSCALED,
  49 };
  50
  51 //////////////////////////////////////////////////////////////////////////
  52 /// Interface to Jitting a fetch shader
  53 //////////////////////////////////////////////////////////////////////////
  54 struct FetchJit : public Builder
  55 {
  56     FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
  57
  58     Function* Create(const FETCH_COMPILE_STATE& fetchState);
  59     Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
  60     Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
  61     Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
  62
  63     // package up Shuffle*bpcGatherd args into a tuple for convenience
  64     typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
  65                        uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
  66                        const uint32_t (&)[4]> Shuffle8bpcArgs;
  67     void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
  68
  69     typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
  70                        uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
  71     void Shuffle16bpcGather(Shuffle16bpcArgs &args);
  72
  73     void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
  74
  75     Value* GenerateCompCtrlVector(const ComponentControl ctrl);
  76
  77     void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut);
  78     void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut);
  79
  80     bool IsOddFormat(SWR_FORMAT format);
  81     bool IsUniformFormat(SWR_FORMAT format);
  82     void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
  83     void CreateGatherOddFormats(SWR_FORMAT format, Value* pBase, Value* offsets, Value* result[4]);
  84     void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
  85
  86 };
  87
  88 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
  89 {
  90     static std::size_t fetchNum = 0;
  91
  92     std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
  93     fnName << fetchNum++;
  94
  95     Function*    fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
  96     BasicBlock*    entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
  97
  98     IRB()->SetInsertPoint(entry);
  99
 100     auto    argitr = fetch->getArgumentList().begin();
 101
 102     // Fetch shader arguments
 103     Value*    fetchInfo = &*argitr; ++argitr;
 104     fetchInfo->setName("fetchInfo");
 105     Value*    pVtxOut = &*argitr;
 106     pVtxOut->setName("vtxOutput");
 107     // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex
 108     // index 0(just the pointer to the simdvertex structure
 109     // index 1(which element of the simdvertex structure to offset to(in this case 0)
 110     // so the indices being i32's doesn't matter
 111     // TODO: generated this GEP with a VECTOR structure type so this makes sense
 112     std::vector<Value*>    vtxInputIndices(2, C(0));
 113     // GEP
 114     pVtxOut = GEP(pVtxOut, C(0));
 115     pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
 116
 117     // SWR_FETCH_CONTEXT::pStreams
 118     Value*    streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
 119     streams->setName("pStreams");
 120
 121     // SWR_FETCH_CONTEXT::pIndices
 122     Value*    indices = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
 123     indices->setName("pIndices");
 124
 125     // SWR_FETCH_CONTEXT::pLastIndex
 126     Value*    pLastIndex = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
 127     pLastIndex->setName("pLastIndex");
 128
 129
 130     Value* vIndices;
 131     switch(fetchState.indexType)
 132     {
 133         case R8_UINT:
 134             indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
 135             if(fetchState.bDisableIndexOOBCheck){
 136                 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
 137                 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
 138             }
 139             else{
 140                 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
 141                 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
 142             }
 143             break;
 144         case R16_UINT:
 145             indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
 146             if(fetchState.bDisableIndexOOBCheck){
 147                 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
 148                 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
 149             }
 150             else{
 151                 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
 152                 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
 153             }
 154             break;
 155         case R32_UINT:
 156             (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
 157                                                : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
 158             break; // incoming type is already 32bit int
 159         default: SWR_ASSERT(0, "Unsupported index type"); vIndices = nullptr; break;
 160     }
 161
 162     // store out vertex IDs
 163     STORE(vIndices, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
 164
 165     // store out cut mask if enabled
 166     if (fetchState.bEnableCutIndex)
 167     {
 168         Value* vCutIndex = VIMMED1(fetchState.cutIndex);
 169         Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
 170         STORE(cutMask, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
 171     }
 172
 173     // Fetch attributes from memory and output to a simdvertex struct
 174     // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
 175     (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut)
 176                                  : JitGatherVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut);
 177
 178     RET_VOID();
 179
 180     JitManager::DumpToFile(fetch, "src");
 181
 182     verifyFunction(*fetch);
 183
 184 #if HAVE_LLVM == 0x306
 185         FunctionPassManager
 186 #else
 187         llvm::legacy::FunctionPassManager
 188 #endif
 189             setupPasses(JM()->mpCurrentModule);
 190
 191     ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
 192     setupPasses.add(createBreakCriticalEdgesPass());
 193     setupPasses.add(createCFGSimplificationPass());
 194     setupPasses.add(createEarlyCSEPass());
 195     setupPasses.add(createPromoteMemoryToRegisterPass());
 196
 197     setupPasses.run(*fetch);
 198
 199     JitManager::DumpToFile(fetch, "se");
 200
 201 #if HAVE_LLVM == 0x306
 202         FunctionPassManager
 203 #else
 204         llvm::legacy::FunctionPassManager
 205 #endif
 206             optPasses(JM()->mpCurrentModule);
 207
 208     ///@todo Haven't touched these either. Need to remove some of these and add others.
 209     optPasses.add(createCFGSimplificationPass());
 210     optPasses.add(createEarlyCSEPass());
 211     optPasses.add(createInstructionCombiningPass());
 212     optPasses.add(createInstructionSimplifierPass());
 213     optPasses.add(createConstantPropagationPass());
 214     optPasses.add(createSCCPPass());
 215     optPasses.add(createAggressiveDCEPass());
 216
 217     optPasses.run(*fetch);
 218     optPasses.run(*fetch);
 219
 220     JitManager::DumpToFile(fetch, "opt");
 221
 222     return fetch;
 223 }
 224
 225 //////////////////////////////////////////////////////////////////////////
 226 /// @brief Loads attributes from memory using LOADs, shuffling the
 227 /// components into SOA form.
 228 /// *Note* currently does not support component control,
 229 /// component packing, or instancing
 230 /// @param fetchState - info about attributes to be fetched from memory
 231 /// @param streams - value pointer to the current vertex stream
 232 /// @param vIndices - vector value of indices to load
 233 /// @param pVtxOut - value pointer to output simdvertex struct
 234 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut)
 235 {
 236     // Zack shuffles; a variant of the Charleston.
 237
 238     SWRL::UncheckedFixedVector<Value*, 16>    vectors;
 239
 240     std::vector<Constant*>    pMask(mVWidth);
 241     for(uint32_t i = 0; i < mVWidth; ++i)
 242     {
 243         pMask[i] = (C(i < 4 ? i : 4));
 244     }
 245     Constant* promoteMask = ConstantVector::get(pMask);
 246     Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
 247
 248     Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
 249
 250     for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
 251     {
 252         Value*    elements[4] = {0};
 253         const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
 254         const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
 255         uint32_t    numComponents = info.numComps;
 256         uint32_t bpc = info.bpp / info.numComps;  ///@todo Code below assumes all components are same size. Need to fix.
 257
 258         vectors.clear();
 259
 260         // load SWR_VERTEX_BUFFER_STATE::pData
 261         Value *stream = LOAD(streams, {ied.StreamIndex, 2});
 262
 263         // load SWR_VERTEX_BUFFER_STATE::pitch
 264         Value *stride = LOAD(streams, {ied.StreamIndex, 1});
 265         stride = Z_EXT(stride, mInt64Ty);
 266
 267         // load SWR_VERTEX_BUFFER_STATE::size
 268         Value *size = LOAD(streams, {ied.StreamIndex, 3});
 269         size = Z_EXT(size, mInt64Ty);
 270
 271         Value* startVertexOffset = MUL(Z_EXT(startVertex, mInt64Ty), stride);
 272
 273         // Load from the stream.
 274         for(uint32_t lane = 0; lane < mVWidth; ++lane)
 275         {
 276             // Get index
 277             Value* index = VEXTRACT(vIndices, C(lane));
 278             index = Z_EXT(index, mInt64Ty);
 279
 280             Value*    offset = MUL(index, stride);
 281             offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
 282             offset = ADD(offset, startVertexOffset);
 283
 284             if (!fetchState.bDisableIndexOOBCheck) {
 285                 // check for out of bound access, including partial OOB, and mask them to 0
 286                 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
 287                 Value *oob = ICMP_ULE(endOffset, size);
 288                 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
 289             }
 290
 291             Value*    pointer = GEP(stream, offset);
 292             // We use a full-lane, but don't actually care.
 293             Value*    vptr = 0;
 294
 295             // get a pointer to a 4 component attrib in default address space
 296             switch(bpc)
 297             {
 298                 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
 299                 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
 300                 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
 301                 default: SWR_ASSERT(false, "Unsupported underlying bpp!");
 302             }
 303
 304             // load 4 components of attribute
 305             Value*    vec = ALIGNED_LOAD(vptr, 1, false);
 306
 307             // Convert To FP32 internally
 308             switch(info.type[0])
 309             {
 310                 case SWR_TYPE_UNORM:
 311                     switch(bpc)
 312                     {
 313                         case 8:
 314                             vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
 315                             vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
 316                             break;
 317                         case 16:
 318                             vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
 319                             vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
 320                             break;
 321                         default:
 322                             SWR_ASSERT(false, "Unsupported underlying type!");
 323                             break;
 324                     }
 325                     break;
 326                 case SWR_TYPE_SNORM:
 327                     switch(bpc)
 328                     {
 329                         case 8:
 330                             vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
 331                             vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
 332                             break;
 333                         case 16:
 334                             vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
 335                             vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
 336                             break;
 337                         default:
 338                             SWR_ASSERT(false, "Unsupported underlying type!");
 339                             break;
 340                     }
 341                     break;
 342                 case SWR_TYPE_UINT:
 343                     // Zero extend uint32_t types.
 344                     switch(bpc)
 345                     {
 346                         case 8:
 347                         case 16:
 348                             vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
 349                             vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
 350                             break;
 351                         case 32:
 352                             break; // Pass through unchanged.
 353                         default:
 354                             SWR_ASSERT(false, "Unsupported underlying type!");
 355                             break;
 356                     }
 357                     break;
 358                 case SWR_TYPE_SINT:
 359                     // Sign extend SINT types.
 360                     switch(bpc)
 361                     {
 362                         case 8:
 363                         case 16:
 364                             vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
 365                             vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
 366                             break;
 367                         case 32:
 368                             break; // Pass through unchanged.
 369                         default:
 370                             SWR_ASSERT(false, "Unsupported underlying type!");
 371                             break;
 372                     }
 373                     break;
 374                 case SWR_TYPE_FLOAT:
 375                     switch(bpc)
 376                     {
 377                         case 32:
 378                             break; // Pass through unchanged.
 379                         default:
 380                             SWR_ASSERT(false, "Unsupported underlying type!");
 381                     }
 382                     break;
 383                 case SWR_TYPE_USCALED:
 384                     vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
 385                     break;
 386                 case SWR_TYPE_SSCALED:
 387                     vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
 388                     break;
 389                 case SWR_TYPE_UNKNOWN:
 390                 case SWR_TYPE_UNUSED:
 391                     SWR_ASSERT(false, "Unsupported type %d!", info.type[0]);
 392             }
 393
 394             // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
 395             // uwvec: 4 x F32, undef value
 396             Value*    wvec = VSHUFFLE(vec, uwvec, promoteMask);
 397             vectors.push_back(wvec);
 398         }
 399
 400         std::vector<Constant*>        v01Mask(mVWidth);
 401         std::vector<Constant*>        v23Mask(mVWidth);
 402         std::vector<Constant*>        v02Mask(mVWidth);
 403         std::vector<Constant*>        v13Mask(mVWidth);
 404
 405         // Concatenate the vectors together.
 406         elements[0] = VUNDEF_F();
 407         elements[1] = VUNDEF_F();
 408         elements[2] = VUNDEF_F();
 409         elements[3] = VUNDEF_F();
 410         for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
 411         {
 412             v01Mask[4 * b + 0] = C(0 + 4 * b);
 413             v01Mask[4 * b + 1] = C(1 + 4 * b);
 414             v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
 415             v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
 416
 417             v23Mask[4 * b + 0] = C(2 + 4 * b);
 418             v23Mask[4 * b + 1] = C(3 + 4 * b);
 419             v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
 420             v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
 421
 422             v02Mask[4 * b + 0] = C(0 + 4 * b);
 423             v02Mask[4 * b + 1] = C(2 + 4 * b);
 424             v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
 425             v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
 426
 427             v13Mask[4 * b + 0] = C(1 + 4 * b);
 428             v13Mask[4 * b + 1] = C(3 + 4 * b);
 429             v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
 430             v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
 431
 432             std::vector<Constant*>    iMask(mVWidth);
 433             for(uint32_t i = 0; i < mVWidth; ++i)
 434             {
 435                 if(((4 * b) <= i) && (i < (4 * (b + 1))))
 436                 {
 437                     iMask[i] = C(i % 4 + mVWidth);
 438                 }
 439                 else
 440                 {
 441                     iMask[i] = C(i);
 442                 }
 443             }
 444             Constant* insertMask = ConstantVector::get(iMask);
 445             elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
 446             elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
 447             elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
 448             elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
 449         }
 450
 451         Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
 452         Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
 453         Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
 454         Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
 455         elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
 456         elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
 457         elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
 458         elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
 459
 460         switch(numComponents + 1)
 461         {
 462             case    1: elements[0] = VIMMED1(0.0f);
 463             case    2: elements[1] = VIMMED1(0.0f);
 464             case    3: elements[2] = VIMMED1(0.0f);
 465             case    4: elements[3] = VIMMED1(1.0f);
 466         }
 467
 468         for(uint32_t c = 0; c < 4; ++c)
 469         {
 470             Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
 471             STORE(elements[c], dest);
 472         }
 473     }
 474 }
 475
 476 // returns true for odd formats that require special state.gather handling
 477 bool FetchJit::IsOddFormat(SWR_FORMAT format)
 478 {
 479     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
 480     if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32)
 481     {
 482         return true;
 483     }
 484     return false;
 485 }
 486
 487 // format is uniform if all components are the same size and type
 488 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
 489 {
 490     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
 491     uint32_t bpc0 = info.bpc[0];
 492     uint32_t type0 = info.type[0];
 493
 494     for (uint32_t c = 1; c < info.numComps; ++c)
 495     {
 496         if (bpc0 != info.bpc[c] || type0 != info.type[c])
 497         {
 498             return false;
 499         }
 500     }
 501     return true;
 502 }
 503
 504 // unpacks components based on format
 505 // foreach component in the pixel
 506 //   mask off everything but this component
 507 //   shift component to LSB
 508 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
 509 {
 510     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
 511
 512     uint32_t bitOffset = 0;
 513     for (uint32_t c = 0; c < info.numComps; ++c)
 514     {
 515         uint32_t swizzledIndex = info.swizzle[c];
 516         uint32_t compBits = info.bpc[c];
 517         uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
 518         Value* comp = AND(vInput, bitmask);
 519         comp = LSHR(comp, bitOffset);
 520
 521         result[swizzledIndex] = comp;
 522         bitOffset += compBits;
 523     }
 524 }
 525
 526 // gather for odd component size formats
 527 // gather SIMD full pixels per lane then shift/mask to move each component to their
 528 // own vector
 529 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pBase, Value* offsets, Value* result[4])
 530 {
 531     const SWR_FORMAT_INFO &info = GetFormatInfo(format);
 532
 533     // only works if pixel size is <= 32bits
 534     SWR_ASSERT(info.bpp <= 32);
 535
 536     Value* gather = VUNDEF_I();
 537
 538     // assign defaults
 539     for (uint32_t comp = 0; comp < 4; ++comp)
 540     {
 541         result[comp] = VIMMED1((int)info.defaults[comp]);
 542     }
 543
 544     // gather SIMD pixels
 545     for (uint32_t e = 0; e < JM()->mVWidth; ++e)
 546     {
 547         Value* elemOffset = VEXTRACT(offsets, C(e));
 548         Value* load = GEP(pBase, elemOffset);
 549
 550         // load the proper amount of data based on component size
 551         switch (info.bpp)
 552         {
 553         case 8: load = POINTER_CAST(load, Type::getInt8PtrTy(JM()->mContext)); break;
 554         case 16: load = POINTER_CAST(load, Type::getInt16PtrTy(JM()->mContext)); break;
 555         case 32: load = POINTER_CAST(load, Type::getInt32PtrTy(JM()->mContext)); break;
 556         default: SWR_ASSERT(0);
 557         }
 558
 559         // load pixel
 560         Value *val = LOAD(load);
 561
 562         // zero extend to 32bit integer
 563         val = INT_CAST(val, mInt32Ty, false);
 564
 565         // store in simd lane
 566         gather = VINSERT(gather, val, C(e));
 567     }
 568
 569     UnpackComponents(format, gather, result);
 570
 571     // cast to fp32
 572     result[0] = BITCAST(result[0], mSimdFP32Ty);
 573     result[1] = BITCAST(result[1], mSimdFP32Ty);
 574     result[2] = BITCAST(result[2], mSimdFP32Ty);
 575     result[3] = BITCAST(result[3], mSimdFP32Ty);
 576 }
 577
 578 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
 579 {
 580     const SWR_FORMAT_INFO &info = GetFormatInfo(format);
 581
 582     for (uint32_t c = 0; c < info.numComps; ++c)
 583     {
 584         uint32_t compIndex = info.swizzle[c];
 585
 586         // skip any conversion on UNUSED components
 587         if (info.type[c] == SWR_TYPE_UNUSED)
 588         {
 589             continue;
 590         }
 591
 592         if (info.isNormalized[c])
 593         {
 594             if (info.type[c] == SWR_TYPE_SNORM)
 595             {
 596                 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
 597
 598                 /// result = c * (1.0f / (2^(n-1) - 1);
 599                 uint32_t n = info.bpc[c];
 600                 uint32_t pow2 = 1 << (n - 1);
 601                 float scale = 1.0f / (float)(pow2 - 1);
 602                 Value *vScale = VIMMED1(scale);
 603                 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
 604                 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
 605                 texels[compIndex] = FMUL(texels[compIndex], vScale);
 606             }
 607             else
 608             {
 609                 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
 610
 611                 /// result = c * (1.0f / (2^n - 1))
 612                 uint32_t n = info.bpc[c];
 613                 uint32_t pow2 = 1 << n;
 614                 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
 615                 if (n == 24)
 616                 {
 617                     float scale = (float)(pow2 - 1);
 618                     Value* vScale = VIMMED1(scale);
 619                     texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
 620                     texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
 621                     texels[compIndex] = FDIV(texels[compIndex], vScale);
 622                 }
 623                 else
 624                 {
 625                     float scale = 1.0f / (float)(pow2 - 1);
 626                     Value *vScale = VIMMED1(scale);
 627                     texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
 628                     texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
 629                     texels[compIndex] = FMUL(texels[compIndex], vScale);
 630                 }
 631             }
 632             continue;
 633         }
 634     }
 635 }
 636
 637 //////////////////////////////////////////////////////////////////////////
 638 /// @brief Loads attributes from memory using AVX2 GATHER(s)
 639 /// @param fetchState - info about attributes to be fetched from memory
 640 /// @param fetchInfo - first argument passed to fetch shader
 641 /// @param streams - value pointer to the current vertex stream
 642 /// @param vIndices - vector value of indices to gather
 643 /// @param pVtxOut - value pointer to output simdvertex struct
 644 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo,
 645                                  Value* streams, Value* vIndices, Value* pVtxOut)
 646 {
 647     uint32_t currentVertexElement = 0;
 648     uint32_t outputElt = 0;
 649     Value* vVertexElements[4];
 650
 651     Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
 652     Value* startInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
 653     Value* curInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
 654     Value* vBaseVertex = VBROADCAST(LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
 655     curInstance->setName("curInstance");
 656
 657     for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
 658     {
 659         const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
 660         const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
 661         uint32_t bpc = info.bpp / info.numComps;  ///@todo Code below assumes all components are same size. Need to fix.
 662
 663         Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
 664
 665         // VGATHER* takes an *i8 src pointer
 666         Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
 667
 668         Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
 669         Value *vStride = VBROADCAST(stride);
 670
 671         // max vertex index that is fully in bounds
 672         Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
 673         maxVertex = LOAD(maxVertex);
 674
 675         Value *vCurIndices;
 676         Value *startOffset;
 677         if(ied.InstanceEnable)
 678         {
 679             Value* stepRate = C(ied.InstanceDataStepRate);
 680
 681             // prevent a div by 0 for 0 step rate
 682             Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
 683             stepRate = SELECT(isNonZeroStep, stepRate, C(1));
 684
 685             // calc the current offset into instanced data buffer
 686             Value* calcInstance = UDIV(curInstance, stepRate);
 687
 688             // if step rate is 0, every instance gets instance 0
 689             calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
 690
 691             vCurIndices = VBROADCAST(calcInstance);
 692
 693             startOffset = startInstance;
 694         }
 695         else
 696         {
 697             // offset indices by baseVertex
 698             vCurIndices = ADD(vIndices, vBaseVertex);
 699
 700             startOffset = startVertex;
 701         }
 702
 703         // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
 704         // do 64bit address offset calculations.
 705
 706         // calculate byte offset to the start of the VB
 707         Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
 708         pStreamBase = GEP(pStreamBase, baseOffset);
 709
 710         // if we have a start offset, subtract from max vertex. Used for OOB check
 711         maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
 712         Value* neg = ICMP_SLT(maxVertex, C((int64_t)0));
 713         // if we have a negative value, we're already OOB. clamp at 0.
 714         maxVertex = SELECT(neg, C(0), TRUNC(maxVertex, mInt32Ty));
 715
 716         // Load the in bounds size of a partially valid vertex
 717         Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
 718         partialInboundsSize = LOAD(partialInboundsSize);
 719         Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
 720         Value* vBpp = VBROADCAST(C(info.Bpp));
 721         Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
 722
 723         // is the element is <= the partially valid size
 724         Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
 725
 726         // are vertices partially OOB?
 727         Value* vMaxVertex = VBROADCAST(maxVertex);
 728         Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
 729
 730         // are vertices are fully in bounds?
 731         Value* vGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
 732
 733         // blend in any partially OOB indices that have valid elements
 734         vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
 735         vGatherMask = VMASK(vGatherMask);
 736
 737         // calculate the actual offsets into the VB
 738         Value* vOffsets = MUL(vCurIndices, vStride);
 739         vOffsets = ADD(vOffsets, vAlignmentOffsets);
 740
 741         // Packing and component control
 742         ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
 743         const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
 744                                              (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
 745
 746         // Special gather/conversion for formats without equal component sizes
 747         if (IsOddFormat((SWR_FORMAT)ied.Format))
 748         {
 749             // Only full 4 component fetch is supported for odd formats
 750             SWR_ASSERT(compMask == XYZW);
 751             Value* pResults[4];
 752             CreateGatherOddFormats((SWR_FORMAT)ied.Format, pStreamBase, vOffsets, pResults);
 753             ConvertFormat((SWR_FORMAT)ied.Format, pResults);
 754
 755             StoreVertexElements(pVtxOut, outputElt++, 4, pResults);
 756             currentVertexElement = 0;
 757         }
 758         else if(info.type[0] == SWR_TYPE_FLOAT)
 759         {
 760             ///@todo: support 64 bit vb accesses
 761             Value* gatherSrc = VIMMED1(0.0f);
 762
 763             SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
 764                 "Unsupported format for standard gather fetch.");
 765
 766             // Gather components from memory to store in a simdvertex structure
 767             switch(bpc)
 768             {
 769                 case 16:
 770                 {
 771                     Value* vGatherResult[2];
 772                     Value *vMask;
 773
 774                     // if we have at least one component out of x or y to fetch
 775                     if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
 776                         // save mask as it is zero'd out after each gather
 777                         vMask = vGatherMask;
 778
 779                         vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
 780                         // e.g. result of first 8x32bit integer gather for 16bit components
 781                         // 256i - 0    1    2    3    4    5    6    7
 782                         //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
 783                         //
 784                     }
 785
 786                     // if we have at least one component out of z or w to fetch
 787                     if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
 788                         // offset base to the next components(zw) in the vertex to gather
 789                         pStreamBase = GEP(pStreamBase, C((char)4));
 790                         vMask = vGatherMask;
 791
 792                         vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
 793                         // e.g. result of second 8x32bit integer gather for 16bit components
 794                         // 256i - 0    1    2    3    4    5    6    7
 795                         //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
 796                         //
 797                     }
 798
 799                     // if we have at least one component to shuffle into place
 800                     if(compMask){
 801                         Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
 802                                                                       currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
 803                         // Shuffle gathered components into place in simdvertex struct
 804                         Shuffle16bpcGather(args);  // outputs to vVertexElements ref
 805                     }
 806                 }
 807                     break;
 808                 case 32:
 809                 {
 810                     for(uint32_t i = 0; i < 4; i++)
 811                     {
 812                         if(!isComponentEnabled(compMask, i)){
 813                             // offset base to the next component in the vertex to gather
 814                             pStreamBase = GEP(pStreamBase, C((char)4));
 815                             continue;
 816                         }
 817
 818                         // if we need to gather the component
 819                         if(compCtrl[i] == StoreSrc){
 820                             // save mask as it is zero'd out after each gather
 821                             Value *vMask = vGatherMask;
 822
 823                             // Gather a SIMD of vertices
 824                             vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
 825                         }
 826                         else{
 827                             vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
 828                         }
 829
 830                         if(currentVertexElement > 3){
 831                             StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
 832                             // reset to the next vVertexElement to output
 833                             currentVertexElement = 0;
 834                         }
 835
 836                         // offset base to the next component in the vertex to gather
 837                         pStreamBase = GEP(pStreamBase, C((char)4));
 838                     }
 839                 }
 840                     break;
 841                 default:
 842                     SWR_ASSERT(0, "Tried to fetch invalid FP format");
 843                     break;
 844             }
 845         }
 846         else
 847         {
 848             Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
 849             ConversionType conversionType = CONVERT_NONE;
 850
 851             SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
 852                 "Unsupported format for standard gather fetch.");
 853
 854             switch(info.type[0])
 855             {
 856                 case SWR_TYPE_UNORM:
 857                     conversionType = CONVERT_NORMALIZED;
 858                 case SWR_TYPE_UINT:
 859                     extendCastType = Instruction::CastOps::ZExt;
 860                     break;
 861                 case SWR_TYPE_SNORM:
 862                     conversionType = CONVERT_NORMALIZED;
 863                 case SWR_TYPE_SINT:
 864                     extendCastType = Instruction::CastOps::SExt;
 865                     break;
 866                 case SWR_TYPE_USCALED:
 867                     conversionType = CONVERT_USCALED;
 868                     extendCastType = Instruction::CastOps::UIToFP;
 869                     break;
 870                 case SWR_TYPE_SSCALED:
 871                     conversionType = CONVERT_SSCALED;
 872                     extendCastType = Instruction::CastOps::SIToFP;
 873                     break;
 874                 default:
 875                     break;
 876             }
 877
 878             // value substituted when component of gather is masked
 879             Value* gatherSrc = VIMMED1(0);
 880
 881             // Gather components from memory to store in a simdvertex structure
 882             switch (bpc)
 883             {
 884                 case 8:
 885                 {
 886                     // if we have at least one component to fetch
 887                     if(compMask){
 888                         Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
 889                         // e.g. result of an 8x32bit integer gather for 8bit components
 890                         // 256i - 0    1    2    3    4    5    6    7
 891                         //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
 892
 893                         Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
 894                                                                      currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
 895                         // Shuffle gathered components into place in simdvertex struct
 896                         Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
 897                     }
 898                 }
 899                 break;
 900                 case 16:
 901                 {
 902                     Value* vGatherResult[2];
 903                     Value *vMask;
 904
 905                     // if we have at least one component out of x or y to fetch
 906                     if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
 907                         // save mask as it is zero'd out after each gather
 908                         vMask = vGatherMask;
 909
 910                         vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
 911                         // e.g. result of first 8x32bit integer gather for 16bit components
 912                         // 256i - 0    1    2    3    4    5    6    7
 913                         //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
 914                         //
 915                     }
 916
 917                     // if we have at least one component out of z or w to fetch
 918                     if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
 919                         // offset base to the next components(zw) in the vertex to gather
 920                         pStreamBase = GEP(pStreamBase, C((char)4));
 921                         vMask = vGatherMask;
 922
 923                         vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
 924                         // e.g. result of second 8x32bit integer gather for 16bit components
 925                         // 256i - 0    1    2    3    4    5    6    7
 926                         //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
 927                         //
 928                     }
 929
 930                     // if we have at least one component to shuffle into place
 931                     if(compMask){
 932                         Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
 933                                                                       currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
 934                         // Shuffle gathered components into place in simdvertex struct
 935                         Shuffle16bpcGather(args);  // outputs to vVertexElements ref
 936                     }
 937                 }
 938                 break;
 939                 case 32:
 940                 {
 941                     SWR_ASSERT(conversionType == CONVERT_NONE);
 942
 943                     // Gathered components into place in simdvertex struct
 944                     for(uint32_t i = 0; i < 4; i++)
 945                     {
 946                         if(!isComponentEnabled(compMask, i)){
 947                             // offset base to the next component in the vertex to gather
 948                             pStreamBase = GEP(pStreamBase, C((char)4));
 949                             continue;
 950                         }
 951
 952                         // if we need to gather the component
 953                         if(compCtrl[i] == StoreSrc){
 954                             // save mask as it is zero'd out after each gather
 955                             Value *vMask = vGatherMask;
 956
 957                             vVertexElements[currentVertexElement++] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
 958
 959                             // e.g. result of a single 8x32bit integer gather for 32bit components
 960                             // 256i - 0    1    2    3    4    5    6    7
 961                             //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
 962                         }
 963                         else{
 964                             vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
 965                         }
 966
 967                         if(currentVertexElement > 3){
 968                             StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
 969                             // reset to the next vVertexElement to output
 970                             currentVertexElement = 0;
 971                         }
 972
 973                         // offset base to the next component  in the vertex to gather
 974                         pStreamBase = GEP(pStreamBase, C((char)4));
 975                     }
 976                 }
 977                 break;
 978             }
 979         }
 980     }
 981
 982     // if we have a partially filled vVertexElement struct, output it
 983     if(currentVertexElement > 0){
 984         StoreVertexElements(pVtxOut, outputElt++, currentVertexElement+1, vVertexElements);
 985     }
 986 }
 987
 988 //////////////////////////////////////////////////////////////////////////
 989 /// @brief Loads a simd of valid indices. OOB indices are set to 0
 990 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
 991 /// support
 992 /// @param pIndices - pointer to 8 bit indices
 993 /// @param pLastIndex - pointer to last valid index
 994 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
 995 {
 996     // can fit 2 16 bit integers per vWidth lane
 997     Value* vIndices =  VUNDEF_I();
 998
 999     // store 0 index on stack to be used to conditionally load from if index address is OOB
1000     Value* pZeroIndex = ALLOCA(mInt8Ty);
1001     STORE(C((uint8_t)0), pZeroIndex);
1002
1003     // Load a SIMD of index pointers
1004     for(int64_t lane = 0; lane < mVWidth; lane++)
1005     {
1006         // Calculate the address of the requested index
1007         Value *pIndex = GEP(pIndices, C(lane));
1008
1009         // check if the address is less than the max index,
1010         Value* mask = ICMP_ULT(pIndex, pLastIndex);
1011
1012         // if valid, load the index. if not, load 0 from the stack
1013         Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1014         Value *index = LOAD(pValid, "valid index");
1015
1016         // zero extended index to 32 bits and insert into the correct simd lane
1017         index = Z_EXT(index, mInt32Ty);
1018         vIndices = VINSERT(vIndices, index, lane);
1019     }
1020     return vIndices;
1021 }
1022
1023 //////////////////////////////////////////////////////////////////////////
1024 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1025 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1026 /// support
1027 /// @param pIndices - pointer to 16 bit indices
1028 /// @param pLastIndex - pointer to last valid index
1029 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1030 {
1031     // can fit 2 16 bit integers per vWidth lane
1032     Value* vIndices =  VUNDEF_I();
1033
1034     // store 0 index on stack to be used to conditionally load from if index address is OOB
1035     Value* pZeroIndex = ALLOCA(mInt16Ty);
1036     STORE(C((uint16_t)0), pZeroIndex);
1037
1038     // Load a SIMD of index pointers
1039     for(int64_t lane = 0; lane < mVWidth; lane++)
1040     {
1041         // Calculate the address of the requested index
1042         Value *pIndex = GEP(pIndices, C(lane));
1043
1044         // check if the address is less than the max index,
1045         Value* mask = ICMP_ULT(pIndex, pLastIndex);
1046
1047         // if valid, load the index. if not, load 0 from the stack
1048         Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1049         Value *index = LOAD(pValid, "valid index");
1050
1051         // zero extended index to 32 bits and insert into the correct simd lane
1052         index = Z_EXT(index, mInt32Ty);
1053         vIndices = VINSERT(vIndices, index, lane);
1054     }
1055     return vIndices;
1056 }
1057
1058 //////////////////////////////////////////////////////////////////////////
1059 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1060 /// @param pIndices - pointer to 32 bit indices
1061 /// @param pLastIndex - pointer to last valid index
1062 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1063 {
1064     DataLayout dL(JM()->mpCurrentModule);
1065     unsigned int ptrSize = dL.getPointerSize() * 8;  // ptr size in bits
1066     Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1067     Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1068
1069     // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1070     Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1071     numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1072     numIndicesLeft = SDIV(numIndicesLeft, C(4));
1073
1074     // create a vector of index counts from the base index ptr passed into the fetch
1075     const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1076     Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1077
1078     // compare index count to the max valid index
1079     // e.g vMaxIndex      4 4 4 4 4 4 4 4 : 4 indices left to load
1080     //     vIndexOffsets  0 1 2 3 4 5 6 7
1081     //     ------------------------------
1082     //     vIndexMask    -1-1-1-1 0 0 0 0 : offsets < max pass
1083     //     vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1084     Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1085     Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1086
1087     // VMASKLOAD takes an *i8 src pointer
1088     pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1089
1090     // Load the indices; OOB loads 0
1091     return MASKLOADD(pIndices,vIndexMask);
1092 }
1093
1094 //////////////////////////////////////////////////////////////////////////
1095 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1096 /// denormalizes if needed, converts to F32 if needed, and positions in
1097 //  the proper SIMD rows to be output to the simdvertex structure
1098 /// @param args: (tuple of args, listed below)
1099 ///   @param vGatherResult - 8 gathered 8bpc vertices
1100 ///   @param pVtxOut - base pointer to output simdvertex struct
1101 ///   @param extendType - sign extend or zero extend
1102 ///   @param bNormalized - do we need to denormalize?
1103 ///   @param currentVertexElement - reference to the current vVertexElement
1104 ///   @param outputElt - reference to the current offset from simdvertex we're o
1105 ///   @param compMask - component packing mask
1106 ///   @param compCtrl - component control val
1107 ///   @param vVertexElements[4] - vertex components to output
1108 ///   @param swizzle[4] - component swizzle location
1109 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1110 {
1111     // Unpack tuple args
1112     Value*& vGatherResult = std::get<0>(args);
1113     Value* pVtxOut = std::get<1>(args);
1114     const Instruction::CastOps extendType = std::get<2>(args);
1115     const ConversionType conversionType = std::get<3>(args);
1116     uint32_t &currentVertexElement = std::get<4>(args);
1117     uint32_t &outputElt =  std::get<5>(args);
1118     const ComponentEnable compMask = std::get<6>(args);
1119     const ComponentControl (&compCtrl)[4] = std::get<7>(args);
1120     Value* (&vVertexElements)[4] = std::get<8>(args);
1121     const uint32_t (&swizzle)[4] = std::get<9>(args);
1122
1123     // cast types
1124     Type* vGatherTy = mSimdInt32Ty;
1125     Type* v32x8Ty =  VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1126
1127     // have to do extra work for sign extending
1128     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
1129         Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
1130         Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1131
1132         // shuffle mask, including any swizzling
1133         const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1134         const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1135         Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
1136                     char(y), char(y+4), char(y+8), char(y+12),
1137                     char(z), char(z+4), char(z+8), char(z+12),
1138                     char(w), char(w+4), char(w+8), char(w+12),
1139                     char(x), char(x+4), char(x+8), char(x+12),
1140                     char(y), char(y+4), char(y+8), char(y+12),
1141                     char(z), char(z+4), char(z+8), char(z+12),
1142                     char(w), char(w+4), char(w+8), char(w+12)});
1143
1144         Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1145         // after pshufb: group components together in each 128bit lane
1146         // 256i - 0    1    2    3    4    5    6    7
1147         //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1148
1149         Value* vi128XY = nullptr;
1150         if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1151             vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1152             // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1153             // 256i - 0    1    2    3    4    5    6    7
1154             //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1155         }
1156
1157         // do the same for zw components
1158         Value* vi128ZW = nullptr;
1159         if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1160             vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1161         }
1162
1163         // init denormalize variables if needed
1164         Instruction::CastOps fpCast;
1165         Value* conversionFactor;
1166
1167         switch (conversionType)
1168         {
1169         case CONVERT_NORMALIZED:
1170             fpCast = Instruction::CastOps::SIToFP;
1171             conversionFactor = VIMMED1((float)(1.0 / 127.0));
1172             break;
1173         case CONVERT_SSCALED:
1174             fpCast = Instruction::CastOps::SIToFP;
1175             conversionFactor = VIMMED1((float)(1.0));
1176             break;
1177         case CONVERT_USCALED:
1178             SWR_ASSERT(0, "Type should not be sign extended!");
1179             conversionFactor = nullptr;
1180             break;
1181         default:
1182             SWR_ASSERT(conversionType == CONVERT_NONE);
1183             conversionFactor = nullptr;
1184             break;
1185         }
1186
1187         // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
1188         for(uint32_t i = 0; i < 4; i++){
1189             if(!isComponentEnabled(compMask, i)){
1190                 continue;
1191             }
1192
1193             if(compCtrl[i] == ComponentControl::StoreSrc){
1194                 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1195                 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1196                 // if x or y, use vi128XY permute result, else use vi128ZW
1197                 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1198
1199                 // sign extend
1200                 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
1201
1202                 // denormalize if needed
1203                 if(conversionType != CONVERT_NONE){
1204                     vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1205                 }
1206                 currentVertexElement++;
1207             }
1208             else{
1209                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1210             }
1211
1212             if(currentVertexElement > 3){
1213                 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1214                 // reset to the next vVertexElement to output
1215                 currentVertexElement = 0;
1216             }
1217         }
1218     }
1219     // else zero extend
1220     else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1221     {
1222         // init denormalize variables if needed
1223         Instruction::CastOps fpCast;
1224         Value* conversionFactor;
1225
1226         switch (conversionType)
1227         {
1228         case CONVERT_NORMALIZED:
1229             fpCast = Instruction::CastOps::UIToFP;
1230             conversionFactor = VIMMED1((float)(1.0 / 255.0));
1231             break;
1232         case CONVERT_USCALED:
1233             fpCast = Instruction::CastOps::UIToFP;
1234             conversionFactor = VIMMED1((float)(1.0));
1235             break;
1236         case CONVERT_SSCALED:
1237             SWR_ASSERT(0, "Type should not be zero extended!");
1238             conversionFactor = nullptr;
1239             break;
1240         default:
1241             SWR_ASSERT(conversionType == CONVERT_NONE);
1242             conversionFactor = nullptr;
1243             break;
1244         }
1245
1246         // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1247         for(uint32_t i = 0; i < 4; i++){
1248             if(!isComponentEnabled(compMask, i)){
1249                 continue;
1250             }
1251
1252             if(compCtrl[i] == ComponentControl::StoreSrc){
1253                 // pshufb masks for each component
1254                 Value* vConstMask;
1255                 switch(swizzle[i]){
1256                     case 0:
1257                         // x shuffle mask
1258                         vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1259                                               0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1260                         break;
1261                     case 1:
1262                         // y shuffle mask
1263                         vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1264                                               1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1265                         break;
1266                     case 2:
1267                         // z shuffle mask
1268                         vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1269                                               2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1270                         break;
1271                     case 3:
1272                         // w shuffle mask
1273                         vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1274                                               3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1275                         break;
1276                     default:
1277                         vConstMask = nullptr;
1278                         break;
1279                 }
1280
1281                 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1282                 // after pshufb for x channel
1283                 // 256i - 0    1    2    3    4    5    6    7
1284                 //        x000 x000 x000 x000 x000 x000 x000 x000
1285
1286                 // denormalize if needed
1287                 if (conversionType != CONVERT_NONE){
1288                     vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1289                 }
1290                 currentVertexElement++;
1291             }
1292             else{
1293                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1294             }
1295
1296             if(currentVertexElement > 3){
1297                 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1298                 // reset to the next vVertexElement to output
1299                 currentVertexElement = 0;
1300             }
1301         }
1302     }
1303     else
1304     {
1305         SWR_ASSERT(0, "Unsupported conversion type");
1306     }
1307 }
1308
1309 //////////////////////////////////////////////////////////////////////////
1310 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1311 /// denormalizes if needed, converts to F32 if needed, and positions in
1312 //  the proper SIMD rows to be output to the simdvertex structure
1313 /// @param args: (tuple of args, listed below)
1314 ///   @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1315 ///   @param pVtxOut - base pointer to output simdvertex struct
1316 ///   @param extendType - sign extend or zero extend
1317 ///   @param bNormalized - do we need to denormalize?
1318 ///   @param currentVertexElement - reference to the current vVertexElement
1319 ///   @param outputElt - reference to the current offset from simdvertex we're o
1320 ///   @param compMask - component packing mask
1321 ///   @param compCtrl - component control val
1322 ///   @param vVertexElements[4] - vertex components to output
1323 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
1324 {
1325     // Unpack tuple args
1326     Value* (&vGatherResult)[2] = std::get<0>(args);
1327     Value* pVtxOut = std::get<1>(args);
1328     const Instruction::CastOps extendType = std::get<2>(args);
1329     const ConversionType conversionType = std::get<3>(args);
1330     uint32_t &currentVertexElement = std::get<4>(args);
1331     uint32_t &outputElt = std::get<5>(args);
1332     const ComponentEnable compMask = std::get<6>(args);
1333     const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1334     Value* (&vVertexElements)[4] = std::get<8>(args);
1335
1336     // cast types
1337     Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1338     Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1339
1340     // have to do extra work for sign extending
1341     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
1342         (extendType == Instruction::CastOps::FPExt))
1343     {
1344         // is this PP float?
1345         bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1346
1347         Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1348         Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1349
1350         // shuffle mask
1351         Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1352                                      0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1353         Value* vi128XY = nullptr;
1354         if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1355             Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1356             // after pshufb: group components together in each 128bit lane
1357             // 256i - 0    1    2    3    4    5    6    7
1358             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1359
1360             vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1361             // after PERMD: move and pack xy components into each 128bit lane
1362             // 256i - 0    1    2    3    4    5    6    7
1363             //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1364         }
1365
1366         // do the same for zw components
1367         Value* vi128ZW = nullptr;
1368         if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1369             Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1370             vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1371         }
1372
1373         // init denormalize variables if needed
1374         Instruction::CastOps IntToFpCast;
1375         Value* conversionFactor;
1376
1377         switch (conversionType)
1378         {
1379         case CONVERT_NORMALIZED:
1380             IntToFpCast = Instruction::CastOps::SIToFP;
1381             conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1382             break;
1383         case CONVERT_SSCALED:
1384             IntToFpCast = Instruction::CastOps::SIToFP;
1385             conversionFactor = VIMMED1((float)(1.0));
1386             break;
1387         case CONVERT_USCALED:
1388             SWR_ASSERT(0, "Type should not be sign extended!");
1389             conversionFactor = nullptr;
1390             break;
1391         default:
1392             SWR_ASSERT(conversionType == CONVERT_NONE);
1393             conversionFactor = nullptr;
1394             break;
1395         }
1396
1397         // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
1398         for(uint32_t i = 0; i < 4; i++){
1399             if(!isComponentEnabled(compMask, i)){
1400                 continue;
1401             }
1402
1403             if(compCtrl[i] == ComponentControl::StoreSrc){
1404                 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1405                 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1406                 // if x or y, use vi128XY permute result, else use vi128ZW
1407                 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1408
1409                 if(bFP) {
1410                     // extract 128 bit lanes to sign extend each component
1411                     vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1412                 }
1413                 else {
1414                     // extract 128 bit lanes to sign extend each component
1415                     vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1416
1417                     // denormalize if needed
1418                     if(conversionType != CONVERT_NONE){
1419                         vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1420                     }
1421                 }
1422                 currentVertexElement++;
1423             }
1424             else{
1425                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1426             }
1427
1428             if(currentVertexElement > 3){
1429                 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1430                 // reset to the next vVertexElement to output
1431                 currentVertexElement = 0;
1432             }
1433         }
1434
1435     }
1436     // else zero extend
1437     else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1438     {
1439         // pshufb masks for each component
1440         Value* vConstMask[2];
1441         if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
1442             // x/z shuffle mask
1443             vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1444                                      0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1445         }
1446
1447         if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
1448             // y/w shuffle mask
1449             vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1450                                      2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1451         }
1452
1453         // init denormalize variables if needed
1454         Instruction::CastOps fpCast;
1455         Value* conversionFactor;
1456
1457         switch (conversionType)
1458         {
1459         case CONVERT_NORMALIZED:
1460             fpCast = Instruction::CastOps::UIToFP;
1461             conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1462             break;
1463         case CONVERT_USCALED:
1464             fpCast = Instruction::CastOps::UIToFP;
1465             conversionFactor = VIMMED1((float)(1.0f));
1466             break;
1467         case CONVERT_SSCALED:
1468             SWR_ASSERT(0, "Type should not be zero extended!");
1469             conversionFactor = nullptr;
1470             break;
1471         default:
1472             SWR_ASSERT(conversionType == CONVERT_NONE);
1473             conversionFactor = nullptr;
1474             break;
1475         }
1476
1477         // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1478         for(uint32_t i = 0; i < 4; i++){
1479             if(!isComponentEnabled(compMask, i)){
1480                 continue;
1481             }
1482
1483             if(compCtrl[i] == ComponentControl::StoreSrc){
1484                 // select correct constMask for x/z or y/w pshufb
1485                 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1486                 // if x or y, use vi128XY permute result, else use vi128ZW
1487                 uint32_t selectedGather = (i < 2) ? 0 : 1;
1488
1489                 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1490                 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1491                 // 256i - 0    1    2    3    4    5    6    7
1492                 //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1493
1494                 // denormalize if needed
1495                 if(conversionType != CONVERT_NONE){
1496                     vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1497                 }
1498                 currentVertexElement++;
1499             }
1500             else{
1501                 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1502             }
1503
1504             if(currentVertexElement > 3){
1505                 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1506                 // reset to the next vVertexElement to output
1507                 currentVertexElement = 0;
1508             }
1509         }
1510     }
1511     else
1512     {
1513         SWR_ASSERT(0, "Unsupported conversion type");
1514     }
1515 }
1516
1517 //////////////////////////////////////////////////////////////////////////
1518 /// @brief Output a simdvertex worth of elements to the current outputElt
1519 /// @param pVtxOut - base address of VIN output struct
1520 /// @param outputElt - simdvertex offset in VIN to write to
1521 /// @param numEltsToStore - number of simdvertex rows to write out
1522 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
1523 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
1524 {
1525     for(uint32_t c = 0; c < numEltsToStore; ++c)
1526     {
1527         // STORE expects FP32 x vWidth type, just bitcast if needed
1528         if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){
1529 #if FETCH_DUMP_VERTEX
1530             PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
1531 #endif
1532             vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
1533         }
1534 #if FETCH_DUMP_VERTEX
1535         else
1536         {
1537             PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
1538         }
1539 #endif
1540         // outputElt * 4 = offsetting by the size of a simdvertex
1541         // + c offsets to a 32bit x vWidth row within the current vertex
1542         Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
1543         STORE(vVertexElements[c], dest);
1544     }
1545 }
1546
1547 //////////////////////////////////////////////////////////////////////////
1548 /// @brief Generates a constant vector of values based on the
1549 /// ComponentControl value
1550 /// @param ctrl - ComponentControl value
1551 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
1552 {
1553     switch(ctrl)
1554     {
1555         case NoStore:   return VUNDEF_I();
1556         case Store0:    return VIMMED1(0);
1557         case Store1Fp:  return VIMMED1(1.0f);
1558         case Store1Int: return VIMMED1(1);
1559         case StoreSrc:
1560         default:        SWR_ASSERT(0, "Invalid component control"); return VUNDEF_I();
1561     }
1562 }
1563
1564 //////////////////////////////////////////////////////////////////////////
1565 /// @brief Returns the enable mask for the specified component.
1566 /// @param enableMask - enable bits
1567 /// @param component - component to check if enabled.
1568 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
1569 {
1570     switch (component)
1571     {
1572         // X
1573     case 0: return (enableMask & ComponentEnable::X);
1574         // Y
1575     case 1: return (enableMask & ComponentEnable::Y);
1576         // Z
1577     case 2: return (enableMask & ComponentEnable::Z);
1578         // W
1579     case 3: return (enableMask & ComponentEnable::W);
1580
1581     default: return false;
1582     }
1583 }
1584
1585
1586 //////////////////////////////////////////////////////////////////////////
1587 /// @brief JITs from fetch shader IR
1588 /// @param hJitMgr - JitManager handle
1589 /// @param func   - LLVM function IR
1590 /// @return PFN_FETCH_FUNC - pointer to fetch code
1591 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
1592 {
1593     const llvm::Function* func = (const llvm::Function*)hFunc;
1594     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1595     PFN_FETCH_FUNC pfnFetch;
1596
1597     pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
1598     // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
1599     pJitMgr->mIsModuleFinalized = true;
1600
1601 #if defined(KNOB_SWRC_TRACING)
1602     char fName[1024];
1603     const char *funcName = func->getName().data();
1604     sprintf(fName, "%s.bin", funcName);
1605     FILE *fd = fopen(fName, "wb");
1606     fwrite((void *)pfnFetch, 1, 2048, fd);
1607     fclose(fd);
1608 #endif
1609
1610     return pfnFetch;
1611 }
1612
1613 //////////////////////////////////////////////////////////////////////////
1614 /// @brief JIT compiles fetch shader
1615 /// @param hJitMgr - JitManager handle
1616 /// @param state   - fetch state to build function from
1617 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
1618 {
1619     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1620
1621     pJitMgr->SetupNewModule();
1622
1623     FetchJit theJit(pJitMgr);
1624     HANDLE hFunc = theJit.Create(state);
1625
1626     return JitFetchFunc(hJitMgr, hFunc);
1627 }