/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @brief Implementation of the fetch jitter
*
******************************************************************************/
#include "jit_pch.hpp"
#include "builder_gfx_mem.h"
#include "fetch_jit.h"
#include "gen_state_llvm.h"
#include "functionpasses/passes.h"

//#define FETCH_DUMP_VERTEX 1
using namespace SwrJit;

bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
//////////////////////////////////////////////////////////////////////////
/// Interface to Jitting a fetch shader
//////////////////////////////////////////////////////////////////////////
struct FetchJit : public BuilderGfxMem
{
    FetchJit(JitManager* pJitMgr) :
        BuilderGfxMem(pJitMgr)
    {}

    Function* Create(const FETCH_COMPILE_STATE& fetchState);

    Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
    Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
    Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
    template <typename T> Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex);

    // package up Shuffle*bpcGatherd args into a tuple for convenience
    typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
        uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
        const uint32_t(&)[4]> Shuffle8bpcArgs;

    void Shuffle8bpcGatherd16(Shuffle8bpcArgs& args);
    void Shuffle8bpcGatherd(Shuffle8bpcArgs& args);

    typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
        uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;

    void Shuffle16bpcGather16(Shuffle16bpcArgs& args);
    void Shuffle16bpcGather(Shuffle16bpcArgs& args);

    void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);

    Value* GenerateCompCtrlVector(const ComponentControl ctrl);

    void JitGatherVertices(const FETCH_COMPILE_STATE& fetchState, Value* streams, Value* vIndices, Value* pVtxOut);

    bool IsOddFormat(SWR_FORMAT format);
    bool IsUniformFormat(SWR_FORMAT format);
    void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
    void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
    void ConvertFormat(SWR_FORMAT format, Value* texels[4]);

    Value* mpFetchInfo;
};
Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
{
    std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
    fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));

    Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
    BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);

    fetch->getParent()->setModuleIdentifier(fetch->getName());

    IRB()->SetInsertPoint(entry);

    auto argitr = fetch->arg_begin();

    // Fetch shader arguments
    Value* privateContext = &*argitr; ++argitr;
    privateContext->setName("privateContext");
    SetPrivateContext(privateContext);

    mpFetchInfo = &*argitr; ++argitr;
    mpFetchInfo->setName("fetchInfo");
    Value* pVtxOut = &*argitr;
    pVtxOut->setName("vtxOutput");

    uint32_t baseWidth = mVWidth;

    SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth);

    // Override builder target width to force 16-wide SIMD
#if USE_SIMD16_SHADERS
    SetTargetWidth(16);
#endif

    pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0));

    // SWR_FETCH_CONTEXT::pStreams
    Value* streams = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_pStreams});
    streams->setName("pStreams");

    // SWR_FETCH_CONTEXT::pIndices
    Value* indices = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpIndices});
    indices->setName("pIndices");

    // SWR_FETCH_CONTEXT::pLastIndex
    Value* pLastIndex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpLastIndex});
    pLastIndex->setName("pLastIndex");

    Value* vIndices;
    switch (fetchState.indexType)
    {
    case R8_UINT:
        indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
        if (fetchState.bDisableIndexOOBCheck)
        {
            vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
            vIndices = Z_EXT(vIndices, mSimdInt32Ty);
        }
        else
        {
            vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
        }
        break;
    case R16_UINT:
        if (fetchState.bDisableIndexOOBCheck)
        {
            vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
            vIndices = Z_EXT(vIndices, mSimdInt32Ty);
        }
        else
        {
            vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
        }
        break;
    case R32_UINT:
        (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(indices, "", PointerType::get(mSimdInt32Ty, 0), GFX_MEM_CLIENT_FETCH)
                                           : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
        break; // incoming type is already 32bit int
    default:
        SWR_INVALID("Unsupported index type");
        vIndices = nullptr;
        break;
    }

    if (fetchState.bForceSequentialAccessEnable)
    {
        Value* pOffsets = mVWidth == 8 ? C({ 0, 1, 2, 3, 4, 5, 6, 7 }) :
            C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });

        // VertexData buffers are accessed sequentially, the index is equal to the vertex number
        vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
        vIndices = ADD(vIndices, pOffsets);
    }

    Value* vVertexId = vIndices;
    if (fetchState.bVertexIDOffsetEnable)
    {
        // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
        Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
        Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
        vVertexId = ADD(vIndices, vBaseVertex);
        vVertexId = ADD(vVertexId, vStartVertex);
    }

    // store out vertex IDs
    if (mVWidth == 16)
    {
        // store out in simd8 halves until core supports 16-wide natively
        auto vVertexIdLo = EXTRACT_16(vVertexId, 0);
        auto vVertexIdHi = EXTRACT_16(vVertexId, 1);
        STORE(vVertexIdLo, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
        STORE(vVertexIdHi, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
    }
    else if (mVWidth == 8)
    {
        STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
    }

    // store out cut mask if enabled
    if (fetchState.bEnableCutIndex)
    {
        Value* vCutIndex = VIMMED1(fetchState.cutIndex);
        Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));

        if (mVWidth == 16)
        {
            auto cutMaskLo = EXTRACT_16(cutMask, 0);
            auto cutMaskHi = EXTRACT_16(cutMask, 1);
            STORE(cutMaskLo, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
            STORE(cutMaskHi, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
        }
        else if (mVWidth == 8)
        {
            STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
        }
    }

    // Fetch attributes from memory and output to a simdvertex struct
    JitGatherVertices(fetchState, streams, vIndices, pVtxOut);

    RET_VOID();

    JitManager::DumpToFile(fetch, "src");

    verifyFunction(*fetch);

    ::FunctionPassManager setupPasses(JM()->mpCurrentModule);

    ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
    setupPasses.add(createBreakCriticalEdgesPass());
    setupPasses.add(createCFGSimplificationPass());
    setupPasses.add(createEarlyCSEPass());
    setupPasses.add(createPromoteMemoryToRegisterPass());

    setupPasses.run(*fetch);

    JitManager::DumpToFile(fetch, "se");

    ::FunctionPassManager optPasses(JM()->mpCurrentModule);

    ///@todo Haven't touched these either. Need to remove some of these and add others.
    optPasses.add(createCFGSimplificationPass());
    optPasses.add(createEarlyCSEPass());
    optPasses.add(createInstructionCombiningPass());
    optPasses.add(createInstructionSimplifierPass());
    optPasses.add(createConstantPropagationPass());
    optPasses.add(createSCCPPass());
    optPasses.add(createAggressiveDCEPass());

    optPasses.run(*fetch);

    optPasses.add(createLowerX86Pass(JM(), this));
    optPasses.run(*fetch);

    JitManager::DumpToFile(fetch, "opt");

    // Revert 16-wide override
#if USE_SIMD16_SHADERS
    SetTargetWidth(baseWidth);
#endif

    return fetch;
}
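// Sketch of how the jitted function is consumed (illustrative only; the exact
// callback typedef lives in the fetch JIT API header, not here):
//
//     PFN_FETCH_FUNC pfnFetch = /* JIT-compiled code address of 'fetch' */;
//     pfnFetch(pPrivateContext, fetchInfo, vtxOut);
//
// The three arguments line up with the privateContext, fetchInfo, and
// vtxOutput parameters unpacked at the top of Create().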
// returns true for odd formats that require special state.gather handling
bool FetchJit::IsOddFormat(SWR_FORMAT format)
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
    {
        return true;
    }
    return false;
}
// format is uniform if all components are the same size and type
bool FetchJit::IsUniformFormat(SWR_FORMAT format)
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    uint32_t bpc0 = info.bpc[0];
    uint32_t type0 = info.type[0];

    for (uint32_t c = 1; c < info.numComps; ++c)
    {
        if (bpc0 != info.bpc[c] || type0 != info.type[c])
        {
            return false;
        }
    }
    return true;
}
// unpacks components based on format
// foreach component in the pixel
//   mask off everything but this component
//   shift component to LSB
void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);

    uint32_t bitOffset = 0;
    for (uint32_t c = 0; c < info.numComps; ++c)
    {
        uint32_t swizzledIndex = info.swizzle[c];
        uint32_t compBits = info.bpc[c];
        uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
        Value* comp = AND(vInput, bitmask);
        comp = LSHR(comp, bitOffset);

        result[swizzledIndex] = comp;
        bitOffset += compBits;
    }
}
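// Worked example (illustrative): for a 10-10-10-2 layout (bpc = {10, 10, 10, 2})
// the loop above produces
//   c=0: bitmask = 0x3FF,      comp = (vInput & 0x3FF)
//   c=1: bitmask = 0xFFC00,    comp = (vInput & 0xFFC00)    >> 10
//   c=2: bitmask = 0x3FF00000, comp = (vInput & 0x3FF00000) >> 20
//   c=3: bitmask = 0xC0000000, comp = (vInput & 0xC0000000) >> 30
// with each comp routed to result[info.swizzle[c]].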
// gather for odd component size formats
// gather SIMD full pixels per lane then shift/mask to move each component to their
// own vector
void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);

    // only works if pixel size is <= 32bits
    SWR_ASSERT(info.bpp <= 32);

    Value* pGather;
    if (info.bpp == 32)
    {
        pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
    }
    else
    {
        // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
        Value* pMem = ALLOCA(mSimdInt32Ty);
        STORE(VIMMED1(0u), pMem);

        pBase = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
        Value* pDstMem = BITCAST(pMem, mInt32PtrTy);

        for (uint32_t lane = 0; lane < mVWidth; ++lane)
        {
            // Get index
            Value* index = VEXTRACT(pOffsets, C(lane));
            Value* mask = VEXTRACT(pMask, C(lane));

            switch (info.bpp)
            {
            case 8:
            {
                Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
                Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt8Ty, 0));
                STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
                break;
            }
            case 16:
            {
                Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
                Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
                STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
                break;
            }
            case 24:
            {
                // First 16-bits of data
                Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
                Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
                STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);

                // Last 8-bits of data
                pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
                pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0));
                STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
                break;
            }
            default:
                SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
                break;
            }
        }

        pGather = LOAD(pMem);
    }

    for (uint32_t comp = 0; comp < 4; ++comp)
    {
        pResult[comp] = VIMMED1((int)info.defaults[comp]);
    }

    UnpackComponents(format, pGather, pResult);

    // cast to fp32
    pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
    pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
    pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
    pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
}
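// Why the scalar per-lane copies above: a sub-32-bit pixel that ends exactly at
// the last mapped byte of a buffer would be read as a full dword by a 32-bit
// gather, touching memory past the allocation and potentially faulting.
// Loading only the 1/2/3 valid bytes per lane avoids that over-read.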
void FetchJit::ConvertFormat(SWR_FORMAT format, Value* texels[4])
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);

    for (uint32_t c = 0; c < info.numComps; ++c)
    {
        uint32_t compIndex = info.swizzle[c];

        // skip any conversion on UNUSED components
        if (info.type[c] == SWR_TYPE_UNUSED)
        {
            continue;
        }

        if (info.isNormalized[c])
        {
            if (info.type[c] == SWR_TYPE_SNORM)
            {
                /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.

                /// result = c * (1.0f / (2^(n-1) - 1))
                uint32_t n = info.bpc[c];
                uint32_t pow2 = 1 << (n - 1);
                float scale = 1.0f / (float)(pow2 - 1);
                Value* vScale = VIMMED1(scale);
                texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
                texels[compIndex] = FMUL(texels[compIndex], vScale);
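                // e.g. 8-bit SNORM: scale = 1/127, so 127 -> 1.0f and -127 -> -1.0f
                // (the clamped most-negative case, -128, is the @todo above).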
            }
            else
            {
                SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);

                /// result = c * (1.0f / (2^n - 1))
                uint32_t n = info.bpc[c];
                uint32_t pow2 = 1 << n;
                // special case 24bit unorm format, which requires a full divide to meet ULP requirement
                if (n == 24)
                {
                    float scale = (float)(pow2 - 1);
                    Value* vScale = VIMMED1(scale);
                    texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                    texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
                    texels[compIndex] = FDIV(texels[compIndex], vScale);
                }
                else
                {
                    float scale = 1.0f / (float)(pow2 - 1);
                    Value* vScale = VIMMED1(scale);
                    texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                    texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
                    texels[compIndex] = FMUL(texels[compIndex], vScale);
                }
            }
        }
    }
}
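// UNORM scale examples: n=8 multiplies by 1/255 after converting to float, while
// n=24 divides by 16777215.0f instead; fp32 has only a 24-bit mantissa, so the
// reciprocal-multiply cannot meet the ULP requirement for 24-bit values.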
//////////////////////////////////////////////////////////////////////////
/// @brief Loads attributes from memory using AVX2 GATHER(s)
/// @param fetchState - info about attributes to be fetched from memory
/// @param streams - value pointer to the current vertex stream
/// @param vIndices - vector value of indices to gather
/// @param pVtxOut - value pointer to output simdvertex struct
void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
    Value* streams, Value* vIndices, Value* pVtxOut)
{
    uint32_t currentVertexElement = 0;
    uint32_t outputElt = 0;
    Value* vVertexElements[4];

    Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
    Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
    Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
    Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
    curInstance->setName("curInstance");
    for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
    {
        const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];

        // skip element if all components are disabled
        if (ied.ComponentPacking == ComponentEnable::NONE)
        {
            continue;
        }

        const SWR_FORMAT_INFO& info = GetFormatInfo((SWR_FORMAT)ied.Format);
        SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
        uint32_t bpc = info.bpp / info.numComps;  ///@todo Code below assumes all components are same size. Need to fix.

        Value* stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});

        // VGATHER* takes an *i8 src pointer
        Value* pStreamBase = INT_TO_PTR(stream, PointerType::get(mInt8Ty, 0));

        Value* stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
        Value* vStride = VBROADCAST(stride);

        // max vertex index that is fully in bounds
        Value* maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
        maxVertex = LOAD(maxVertex);

        Value* minVertex = NULL;
        if (fetchState.bPartialVertexBuffer)
        {
            // min vertex index for low bounds OOB checking
            minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
            minVertex = LOAD(minVertex);
        }

        if (fetchState.bInstanceIDOffsetEnable)
        {
            // the InstanceID (curInstance) value is offset by StartInstanceLocation
            curInstance = ADD(curInstance, startInstance);
        }
        Value* vCurIndices;
        Value* startOffset;
        Value* vInstanceStride = VIMMED1(0);

        if (ied.InstanceEnable)
        {
            Value* stepRate = C(ied.InstanceAdvancementState);

            // prevent a div by 0 for 0 step rate
            Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
            stepRate = SELECT(isNonZeroStep, stepRate, C(1));

            // calc the current offset into instanced data buffer
            Value* calcInstance = UDIV(curInstance, stepRate);

            // if step rate is 0, every instance gets instance 0
            calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));

            vCurIndices = VBROADCAST(calcInstance);
            startOffset = startInstance;
        }
        else if (ied.InstanceStrideEnable)
        {
            // grab the instance advancement state, determines stride in bytes from one instance to the next
            Value* stepRate = C(ied.InstanceAdvancementState);
            vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));

            // offset indices by baseVertex
            vCurIndices = ADD(vIndices, vBaseVertex);

            startOffset = startVertex;
            SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
        }
        else
        {
            // offset indices by baseVertex
            vCurIndices = ADD(vIndices, vBaseVertex);
            startOffset = startVertex;
        }
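        // Step-rate example: with InstanceAdvancementState == 2, instances 0 and 1
        // read instanced-data record 0, instances 2 and 3 read record 1, and so on
        // (curInstance / stepRate); a step rate of 0 selects record 0 for every
        // instance via the isNonZeroStep SELECTs above.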
        // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
        // do 64bit address offset calculations.

        // calculate byte offset to the start of the VB
        Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
        pStreamBase = GEP(pStreamBase, baseOffset);
        Value* pStreamBaseGFX = ADD(stream, baseOffset);

        // if we have a start offset, subtract from max vertex. Used for OOB check
        maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
        Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
        // if we have a negative value, we're already OOB. clamp at 0.
        maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));

        if (fetchState.bPartialVertexBuffer)
        {
            // similarly for min vertex
            minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
            Value* minNeg = ICMP_SLT(minVertex, C((int64_t)0));
            minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
        }

        // Load the in bounds size of a partially valid vertex
        Value* partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
        partialInboundsSize = LOAD(partialInboundsSize);
        Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
        Value* vBpp = VBROADCAST(C(info.Bpp));
        Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));

        // is the element <= the partially valid size
        Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
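        // Units note (an assumption from the SWR_FORMAT_INFO naming convention):
        // info.Bpp is taken to be bytes per element, unlike the bit count info.bpp
        // used elsewhere, making this a byte-based check:
        //     AlignedByteOffset + Bpp <= partialInboundsSize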
        // override cur indices with 0 if pitch is 0
        Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
        vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);

        // are vertices partially OOB?
        Value* vMaxVertex = VBROADCAST(maxVertex);
        Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);

        // are vertices fully in bounds?
        Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);

        Value* vGatherMask;
        if (fetchState.bPartialVertexBuffer)
        {
            // are vertices below minVertex limit?
            Value* vMinVertex = VBROADCAST(minVertex);
            Value* vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);

            // only fetch lanes that pass both tests
            vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
        }
        else
        {
            vGatherMask = vMaxGatherMask;
        }

        // blend in any partially OOB indices that have valid elements
        vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);

        // calculate the actual offsets into the VB
        Value* vOffsets = MUL(vCurIndices, vStride);
        vOffsets = ADD(vOffsets, vAlignmentOffsets);

        // if instance stride enable is:
        //     true  - add product of the instanceID and advancement state to the offset into the VB
        //     false - value of vInstanceStride has been initialized to zero
        vOffsets = ADD(vOffsets, vInstanceStride);
        // Packing and component control
        ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
        const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
                                             (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3 };
        // Special gather/conversion for formats without equal component sizes
        if (IsOddFormat((SWR_FORMAT)ied.Format))
        {
            Value* pResults[4];
            CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
            ConvertFormat((SWR_FORMAT)ied.Format, pResults);

            for (uint32_t c = 0; c < 4; c += 1)
            {
                if (isComponentEnabled(compMask, c))
                {
                    vVertexElements[currentVertexElement++] = pResults[c];
                    if (currentVertexElement > 3)
                    {
                        StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                        // reset to the next vVertexElement to output
                        currentVertexElement = 0;
                    }
                }
            }
        }
        else if (info.type[0] == SWR_TYPE_FLOAT)
        {
            ///@todo: support 64 bit vb accesses
            Value* gatherSrc = VIMMED1(0.0f);

            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
                "Unsupported format for standard gather fetch.");

            // Gather components from memory to store in a simdvertex structure
            switch (bpc)
            {
            case 16:
            {
                Value* vGatherResult[2];

                // if we have at least one component out of x or y to fetch
                if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
                {
                    vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
                    // e.g. result of first 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
                }

                // if we have at least one component out of z or w to fetch
                if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
                {
                    // offset base to the next components(zw) in the vertex to gather
                    pStreamBase = GEP(pStreamBase, C((char)4));

                    vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
                    // e.g. result of second 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                }

                // if we have at least one component to shuffle into place
                if (compMask)
                {
                    Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
                        currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);

                    // Shuffle gathered components into place in simdvertex struct
                    mVWidth == 16 ? Shuffle16bpcGather16(args) : Shuffle16bpcGather(args);  // outputs to vVertexElements ref
                }
            }
            break;
            case 32:
            {
                for (uint32_t i = 0; i < 4; i += 1)
                {
                    if (isComponentEnabled(compMask, i))
                    {
                        // if we need to gather the component
                        if (compCtrl[i] == StoreSrc)
                        {
                            // Gather a SIMD of vertices
                            // APIs allow a 4GB range for offsets
                            // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
                            // But, we know that elements must be aligned for FETCH. :)
                            // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
                            Value* vShiftedOffsets = LSHR(vOffsets, 1);
                            vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBaseGFX, vShiftedOffsets, vGatherMask, 2, GFX_MEM_CLIENT_FETCH);
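                            // e.g. an offset of 0x80000004 (>2GB) would read as
                            // negative through a signed 32-bit gather; LSHR yields
                            // 0x40000002 and the scale of 2 passed to GATHERPS
                            // restores the original byte address (valid because
                            // offsets of 4-byte elements are always even).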
                        }
                        else
                        {
                            vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                        }

                        if (currentVertexElement > 3)
                        {
                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                            // reset to the next vVertexElement to output
                            currentVertexElement = 0;
                        }
                    }

                    // offset base to the next component in the vertex to gather
                    pStreamBase = GEP(pStreamBase, C((char)4));
                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
                }
            }
            break;
            case 64:
            {
                for (uint32_t i = 0; i < 4; i += 1)
                {
                    if (isComponentEnabled(compMask, i))
                    {
                        // if we need to gather the component
                        if (compCtrl[i] == StoreSrc)
                        {
                            Value* vShufLo;
                            Value* vShufHi;
                            Value* vShufAll;

                            if (mVWidth == 8)
                            {
                                vShufLo = C({ 0, 1, 2, 3 });
                                vShufHi = C({ 4, 5, 6, 7 });
                                vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
                            }
                            else
                            {
                                SWR_ASSERT(mVWidth == 16);
                                vShufLo = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
                                vShufHi = C({ 8, 9, 10, 11, 12, 13, 14, 15 });
                                vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
                            }

                            Value* vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
                            Value* vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);

                            Value* vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
                            Value* vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);

                            Value* vZeroDouble = VECTOR_SPLAT(mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));

                            Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
                            Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);

                            pGatherLo = VCVTPD2PS(pGatherLo);
                            pGatherHi = VCVTPD2PS(pGatherHi);

                            Value* pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);

                            vVertexElements[currentVertexElement++] = pGather;
                        }
                        else
                        {
                            vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                        }

                        if (currentVertexElement > 3)
                        {
                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                            // reset to the next vVertexElement to output
                            currentVertexElement = 0;
                        }
                    }

                    // offset base to the next component in the vertex to gather
                    pStreamBase = GEP(pStreamBase, C((char)8));
                }
            }
            break;
            default:
                SWR_INVALID("Tried to fetch invalid FP format");
                break;
            }
        }
        else
        {
            Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
            ConversionType conversionType = CONVERT_NONE;

            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
                "Unsupported format for standard gather fetch.");

            switch (info.type[0])
            {
            case SWR_TYPE_UNORM:
                conversionType = CONVERT_NORMALIZED;
                extendCastType = Instruction::CastOps::ZExt;
                break;
            case SWR_TYPE_SNORM:
                conversionType = CONVERT_NORMALIZED;
                extendCastType = Instruction::CastOps::SExt;
                break;
            case SWR_TYPE_USCALED:
                conversionType = CONVERT_USCALED;
                extendCastType = Instruction::CastOps::UIToFP;
                break;
            case SWR_TYPE_SSCALED:
                conversionType = CONVERT_SSCALED;
                extendCastType = Instruction::CastOps::SIToFP;
                break;
            case SWR_TYPE_SFIXED:
                conversionType = CONVERT_SFIXED;
                extendCastType = Instruction::CastOps::SExt;
                break;
            default:
                break;
            }

            // value substituted when component of gather is masked
            Value* gatherSrc = VIMMED1(0);

            // Gather components from memory to store in a simdvertex structure
            switch (bpc)
            {
            case 8:
            {
                // if we have at least one component to fetch
                if (compMask)
                {
                    Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
                    // e.g. result of an 8x32bit integer gather for 8bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

                    Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
                        currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);

                    // Shuffle gathered components into place in simdvertex struct
                    mVWidth == 16 ? Shuffle8bpcGatherd16(args) : Shuffle8bpcGatherd(args);  // outputs to vVertexElements ref
                }
            }
            break;
            case 16:
            {
                Value* vGatherResult[2];

                // if we have at least one component out of x or y to fetch
                if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
                {
                    vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
                    // e.g. result of first 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
                }

                // if we have at least one component out of z or w to fetch
                if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
                {
                    // offset base to the next components(zw) in the vertex to gather
                    pStreamBase = GEP(pStreamBase, C((char)4));

                    vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
                    // e.g. result of second 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                }

                // if we have at least one component to shuffle into place
                if (compMask)
                {
                    Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
                        currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);

                    // Shuffle gathered components into place in simdvertex struct
                    mVWidth == 16 ? Shuffle16bpcGather16(args) : Shuffle16bpcGather(args);  // outputs to vVertexElements ref
                }
            }
            break;
            case 32:
            {
                // Gathered components into place in simdvertex struct
                for (uint32_t i = 0; i < 4; i++)
                {
                    if (isComponentEnabled(compMask, i))
                    {
                        // if we need to gather the component
                        if (compCtrl[i] == StoreSrc)
                        {
                            Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);

                            if (conversionType == CONVERT_USCALED)
                            {
                                pGather = UI_TO_FP(pGather, mSimdFP32Ty);
                            }
                            else if (conversionType == CONVERT_SSCALED)
                            {
                                pGather = SI_TO_FP(pGather, mSimdFP32Ty);
                            }
                            else if (conversionType == CONVERT_SFIXED)
                            {
                                pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
                            }

                            vVertexElements[currentVertexElement++] = pGather;

                            // e.g. result of a single 8x32bit integer gather for 32bit components
                            // 256i - 0    1    2    3    4    5    6    7
                            //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
                        }
                        else
                        {
                            vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                        }

                        if (currentVertexElement > 3)
                        {
                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);

                            // reset to the next vVertexElement to output
                            currentVertexElement = 0;
                        }
                    }

                    // offset base to the next component in the vertex to gather
                    pStreamBase = GEP(pStreamBase, C((char)4));
                }
            }
            break;
            }
        }
    }

    // if we have a partially filled vVertexElement struct, output it
    if (currentVertexElement > 0)
    {
        StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
    }
}
typedef void* (*PFN_TRANSLATEGFXADDRESS_FUNC)(void* pdc, gfxptr_t va);
extern "C" void GetSimdValid8bitIndicesGfx(gfxptr_t indices, gfxptr_t lastIndex, uint32_t vWidth, PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, void* pdc, uint32_t* outIndices);
extern "C" void GetSimdValid16bitIndicesGfx(gfxptr_t indices, gfxptr_t lastIndex, uint32_t vWidth, PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, void* pdc, uint32_t* outIndices);
template <typename T>
Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex)
{
    SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty, "Function expects gfxptr_t for both input parameters.");

    Type* Ty = nullptr;

    static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t), "Unsupported type for use with GetSimdValidIndicesHelper<T>");
    constexpr bool bSize = (sizeof(T) == sizeof(uint16_t));
    if (bSize)
    {
        Ty = mInt16PtrTy;
    }
    else if (sizeof(T) == sizeof(uint8_t))
    {
        Ty = mInt8PtrTy;
    }
    else
    {
        SWR_ASSERT(false, "This should never happen as per static_assert above.");
    }

    Value* vIndices = VUNDEF_I();

    // store 0 index on stack to be used to conditionally load from if index address is OOB
    Value* pZeroIndex = ALLOCA(Ty);
    STORE(C((T)0), pZeroIndex);

    // Load a SIMD of index pointers
    for (int64_t lane = 0; lane < mVWidth; lane++)
    {
        // Calculate the address of the requested index
        Value* pIndex = GEP(pIndices, C(lane), Ty);

        pLastIndex = INT_TO_PTR(pLastIndex, Ty);

        // check if the address is less than the max index,
        Value* mask = ICMP_ULT(pIndex, pLastIndex);

        // if valid, load the index. if not, load 0 from the stack
        Value* pValid = SELECT(mask, pIndex, pZeroIndex);
        Value* index = LOAD(pValid, "valid index", Ty, GFX_MEM_CLIENT_FETCH);

        // zero extended index to 32 bits and insert into the correct simd lane
        index = Z_EXT(index, mInt32Ty);
        vIndices = VINSERT(vIndices, index, lane);
    }

    return vIndices;
}
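// Example of the OOB contract above: with 5 indices in the buffer and mVWidth == 8,
// lanes 0-4 load real indices while lanes 5-7 fail the ICMP_ULT check and read the
// zeroed stack slot instead, so out-of-bounds lanes produce index 0.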
//////////////////////////////////////////////////////////////////////////
/// @brief Loads a simd of valid indices. OOB indices are set to 0
/// *Note* have to do 8bit index checking in scalar until we have AVX-512
/// @param pIndices - pointer to 8 bit indices
/// @param pLastIndex - pointer to last valid index
Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
{
    return GetSimdValidIndicesHelper<uint8_t>(pIndices, pLastIndex);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Loads a simd of valid indices. OOB indices are set to 0
/// *Note* have to do 16bit index checking in scalar until we have AVX-512
/// @param pIndices - pointer to 16 bit indices
/// @param pLastIndex - pointer to last valid index
Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
{
    return GetSimdValidIndicesHelper<uint16_t>(pIndices, pLastIndex);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Loads a simd of valid indices. OOB indices are set to 0
/// @param pIndices - pointer to 32 bit indices
/// @param pLastIndex - pointer to last valid index
Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
{
    DataLayout dL(JM()->mpCurrentModule);
    unsigned int ptrSize = dL.getPointerSize() * 8;  // ptr size in bits
    Value* iLastIndex = pLastIndex;
    Value* iIndices = pIndices;

    // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
    Value* numIndicesLeft = SUB(iLastIndex, iIndices);
    numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
    numIndicesLeft = SDIV(numIndicesLeft, C(4));

    // create a vector of index counts from the base index ptr passed into the fetch
    Constant* vIndexOffsets;
    if (mVWidth == 8)
    {
        vIndexOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
    }
    else
    {
        vIndexOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
    }

    // compare index count to the max valid index
    // e.g vMaxIndex      4 4 4 4 4 4 4 4 : 4 indices left to load
    //     vIndexOffsets  0 1 2 3 4 5 6 7
    //     ------------------------------
    //     vIndexMask    -1-1-1-1 0 0 0 0 : offsets < max pass
    //     vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
    Value* vMaxIndex = VBROADCAST(numIndicesLeft);
    Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);

    // Load the indices; OOB loads 0
    pIndices = BITCAST(pIndices, PointerType::get(mSimdInt32Ty, 0));
    return MASKED_LOAD(pIndices, 4, vIndexMask, VIMMED1(0));
}
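// Note on the MASKED_LOAD above: the second argument is the 4-byte (one index)
// alignment, and masked-off lanes take the VIMMED1(0) passthrough value, which
// is what implements the "OOB indices are set to 0" contract in the brief.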
//////////////////////////////////////////////////////////////////////////
/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
/// denormalizes if needed, converts to F32 if needed, and positions in
//  the proper SIMD rows to be output to the simdvertex structure
/// @param args: (tuple of args, listed below)
///   @param vGatherResult - 8 gathered 8bpc vertices
///   @param pVtxOut - base pointer to output simdvertex struct
///   @param extendType - sign extend or zero extend
///   @param bNormalized - do we need to denormalize?
///   @param currentVertexElement - reference to the current vVertexElement
///   @param outputElt - reference to the current offset from simdvertex we're outputting to
///   @param compMask - component packing mask
///   @param compCtrl - component control val
///   @param vVertexElements[4] - vertex components to output
///   @param swizzle[4] - component swizzle location
void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs& args)
{
    // Unpack tuple args
    Value*& vGatherResult = std::get<0>(args);
    Value* pVtxOut = std::get<1>(args);
    const Instruction::CastOps extendType = std::get<2>(args);
    const ConversionType conversionType = std::get<3>(args);
    uint32_t& currentVertexElement = std::get<4>(args);
    uint32_t& outputElt = std::get<5>(args);
    const ComponentEnable compMask = std::get<6>(args);
    const ComponentControl(&compCtrl)[4] = std::get<7>(args);
    Value* (&vVertexElements)[4] = std::get<8>(args);
    const uint32_t(&swizzle)[4] = std::get<9>(args);
    Type* vGatherTy = VectorType::get(mInt32Ty, 8);
    Type* v32x8Ty = VectorType::get(mInt8Ty, 32);

    // have to do extra work for sign extending
    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
    {
        Type* v16x8Ty = VectorType::get(mInt8Ty, 16);  // 8x16bit ints in a 128bit lane
        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);

        // shuffle mask, including any swizzling
        const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
        const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
        Value* vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12),
                                      char(y), char(y + 4), char(y + 8), char(y + 12),
                                      char(z), char(z + 4), char(z + 8), char(z + 12),
                                      char(w), char(w + 4), char(w + 8), char(w + 12),
                                      char(x), char(x + 4), char(x + 8), char(x + 12),
                                      char(y), char(y + 4), char(y + 8), char(y + 12),
                                      char(z), char(z + 4), char(z + 8), char(z + 12),
                                      char(w), char(w + 4), char(w + 8), char(w + 12) });
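        // e.g. with the identity swizzle (x,y,z,w = 0,1,2,3) the first 16 mask bytes
        // are {0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15}: byte 0 of each of the four
        // gathered dwords (the x components) first, then the y, z, and w bytes, with
        // the same pattern repeated for the upper 128-bit half.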
        // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
        Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
        Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);

        Value* vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
        Value* vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);

        // after pshufb: group components together in each 128bit lane
        // 256i - 0    1    2    3    4    5    6    7
        //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

        Value* vi128XY_lo = nullptr;
        Value* vi128XY_hi = nullptr;
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
        {
            vi128XY_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
            vi128XY_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);

            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
        }

        // do the same for zw components
        Value* vi128ZW_lo = nullptr;
        Value* vi128ZW_hi = nullptr;
        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
        {
            vi128ZW_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
            vi128ZW_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
        }
        // init denormalize variables if needed
        Instruction::CastOps fpCast;
        Value* conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 127.0));
            break;
        case CONVERT_SSCALED:
            fpCast = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_USCALED:
            SWR_INVALID("Type should not be sign extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }
        // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
                    Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;

                    // sign extend
                    Value* temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
                    Value* temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));

                    Value* temp = JOIN_16(temp_lo, temp_hi);

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
                    }

                    vVertexElements[currentVertexElement] = temp;

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
    {
        // init denormalize variables if needed
        Instruction::CastOps fpCast;
        Value* conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 255.0));
            break;
        case CONVERT_USCALED:
            fpCast = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_SSCALED:
            SWR_INVALID("Type should not be zero extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // pshufb masks for each component
                    Value* vConstMask = nullptr;
                    switch (swizzle[i])
                    {
                    case 0:
                        // x shuffle mask
                        vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                               0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
                        break;
                    case 1:
                        // y shuffle mask
                        vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                               1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
                        break;
                    case 2:
                        // z shuffle mask
                        vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                               2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
                        break;
                    case 3:
                        // w shuffle mask
                        vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                               3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
                        break;
                    default:
                        vConstMask = nullptr;
                        break;
                    }

                    Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
                    Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);

                    Value* temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
                    Value* temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);

                    // after pshufb for x channel
                    // 256i - 0    1    2    3    4    5    6    7
                    //        x000 x000 x000 x000 x000 x000 x000 x000

                    Value* temp = JOIN_16(temp_lo, temp_hi);

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
                    }

                    vVertexElements[currentVertexElement] = temp;

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_INVALID("Unsupported conversion type");
    }
}
void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs& args)
{
    // Unpack tuple args
    Value*& vGatherResult = std::get<0>(args);
    Value* pVtxOut = std::get<1>(args);
    const Instruction::CastOps extendType = std::get<2>(args);
    const ConversionType conversionType = std::get<3>(args);
    uint32_t& currentVertexElement = std::get<4>(args);
    uint32_t& outputElt = std::get<5>(args);
    const ComponentEnable compMask = std::get<6>(args);
    const ComponentControl(&compCtrl)[4] = std::get<7>(args);
    Value* (&vVertexElements)[4] = std::get<8>(args);
    const uint32_t(&swizzle)[4] = std::get<9>(args);

    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4);  // vwidth is units of 32 bits

    for (uint32_t i = 0; i < 4; i++)
    {
        if (!isComponentEnabled(compMask, i))
        {
            continue;
        }

        if (compCtrl[i] == ComponentControl::StoreSrc)
        {
            std::vector<uint32_t> vShuffleMasks[4] = {
                { 0, 4, 8, 12, 16, 20, 24, 28 },  // x
                { 1, 5, 9, 13, 17, 21, 25, 29 },  // y
                { 2, 6, 10, 14, 18, 22, 26, 30 }, // z
                { 3, 7, 11, 15, 19, 23, 27, 31 }, // w
            };

            Value* val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
                                  UndefValue::get(v32x8Ty),
                                  vShuffleMasks[swizzle[i]]);

            if ((extendType == Instruction::CastOps::SExt) ||
                (extendType == Instruction::CastOps::SIToFP)) {
                switch (conversionType)
                {
                case CONVERT_NORMALIZED:
                    val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
                    break;
                case CONVERT_SSCALED:
                    val = SI_TO_FP(val, mSimdFP32Ty);
                    break;
                case CONVERT_USCALED:
                    SWR_INVALID("Type should not be sign extended!");
                    break;
                default:
                    SWR_ASSERT(conversionType == CONVERT_NONE);
                    val = S_EXT(val, mSimdInt32Ty);
                    break;
                }
            }
            else if ((extendType == Instruction::CastOps::ZExt) ||
                     (extendType == Instruction::CastOps::UIToFP)) {
                switch (conversionType)
                {
                case CONVERT_NORMALIZED:
                    val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
                    break;
                case CONVERT_SSCALED:
                    SWR_INVALID("Type should not be zero extended!");
                    break;
                case CONVERT_USCALED:
                    val = UI_TO_FP(val, mSimdFP32Ty);
                    break;
                default:
                    SWR_ASSERT(conversionType == CONVERT_NONE);
                    val = Z_EXT(val, mSimdInt32Ty);
                    break;
                }
            }
            else
            {
                SWR_INVALID("Unsupported conversion type");
            }

            vVertexElements[currentVertexElement++] = val;
        }
        else
        {
            vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
        }

        if (currentVertexElement > 3)
        {
            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
            // reset to the next vVertexElement to output
            currentVertexElement = 0;
        }
    }
}
//////////////////////////////////////////////////////////////////////////
/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
/// denormalizes if needed, converts to F32 if needed, and positions in
//  the proper SIMD rows to be output to the simdvertex structure
/// @param args: (tuple of args, listed below)
///   @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
///   @param pVtxOut - base pointer to output simdvertex struct
///   @param extendType - sign extend or zero extend
///   @param bNormalized - do we need to denormalize?
///   @param currentVertexElement - reference to the current vVertexElement
///   @param outputElt - reference to the current offset from simdvertex we're outputting to
///   @param compMask - component packing mask
///   @param compCtrl - component control val
///   @param vVertexElements[4] - vertex components to output
void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs& args)
{
    // Unpack tuple args
    Value* (&vGatherResult)[2] = std::get<0>(args);
    Value* pVtxOut = std::get<1>(args);
    const Instruction::CastOps extendType = std::get<2>(args);
    const ConversionType conversionType = std::get<3>(args);
    uint32_t& currentVertexElement = std::get<4>(args);
    uint32_t& outputElt = std::get<5>(args);
    const ComponentEnable compMask = std::get<6>(args);
    const ComponentControl(&compCtrl)[4] = std::get<7>(args);
    Value* (&vVertexElements)[4] = std::get<8>(args);
    Type* vGatherTy = VectorType::get(mInt32Ty, 8);
    Type* v32x8Ty = VectorType::get(mInt8Ty, 32);

    // have to do extra work for sign extending
    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
    {
        // is this PP float?
        bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;

        Type* v8x16Ty = VectorType::get(mInt16Ty, 8);  // 8x16bit in a 128bit lane
        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);

        // shuffle mask
        Value* vConstMask = C<uint8_t>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
        Value* vi128XY_lo = nullptr;
        Value* vi128XY_hi = nullptr;
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
        {
            // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
            Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty);
            Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty);

            Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
            Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);

            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            vi128XY_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
            vi128XY_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);

            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
        }

        // do the same for zw components
        Value* vi128ZW_lo = nullptr;
        Value* vi128ZW_hi = nullptr;
        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
        {
            Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty);
            Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty);

            Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
            Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);

            vi128ZW_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
            vi128ZW_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
        }
        // init denormalize variables if needed
        Instruction::CastOps IntToFpCast;
        Value* conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            IntToFpCast = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
            break;
        case CONVERT_SSCALED:
            IntToFpCast = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_USCALED:
            SWR_INVALID("Type should not be sign extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }
        // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
                    Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;

                    if (bFP)
                    {
                        // extract 128 bit lanes to sign extend each component
                        Value* temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
                        Value* temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));

                        vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
                    }
                    else
                    {
                        // extract 128 bit lanes to sign extend each component
                        Value* temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
                        Value* temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));

                        Value* temp = JOIN_16(temp_lo, temp_hi);

                        // denormalize if needed
                        if (conversionType != CONVERT_NONE)
                        {
                            temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor);
                        }

                        vVertexElements[currentVertexElement] = temp;
                    }

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
    {
        // pshufb masks for each component
        Value* vConstMask[2];

        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
        {
            // x/z shuffle mask
            vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                                      0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
        }

        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
        {
            // y/w shuffle mask
            vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                      2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
        }

        // init denormalize variables if needed
        Instruction::CastOps fpCast;
        Value* conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
            break;
        case CONVERT_USCALED:
            fpCast = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0f));
            break;
        case CONVERT_SSCALED:
            SWR_INVALID("Type should not be zero extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // select correct constMask for x/z or y/w pshufb
                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    uint32_t selectedGather = (i < 2) ? 0 : 1;

                    // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
                    Value* vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
                    Value* vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);

                    Value* temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
                    Value* temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);

                    // after pshufb mask for x channel; z uses the same shuffle from the second gather
                    // 256i - 0    1    2    3    4    5    6    7
                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00

                    Value* temp = JOIN_16(temp_lo, temp_hi);

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
                    }

                    vVertexElements[currentVertexElement] = temp;

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_INVALID("Unsupported conversion type");
    }
}

void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs& args)
{
    // Unpack tuple args
    Value* (&vGatherResult)[2] = std::get<0>(args);
    Value* pVtxOut = std::get<1>(args);
    const Instruction::CastOps extendType = std::get<2>(args);
    const ConversionType conversionType = std::get<3>(args);
    uint32_t& currentVertexElement = std::get<4>(args);
    uint32_t& outputElt = std::get<5>(args);
    const ComponentEnable compMask = std::get<6>(args);
    const ComponentControl(&compCtrl)[4] = std::get<7>(args);
    Value* (&vVertexElements)[4] = std::get<8>(args);

    // cast types
    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
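    // With the simd8 builder (assuming mVWidth == 8), vGatherTy is <8 x i32> and
    // v32x8Ty is <32 x i8>: the same 256 bits of gathered data viewed as dwords
    // for the bitcasts and as bytes for PSHUFB.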

    // have to do extra work for sign extending
    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) ||
        (extendType == Instruction::CastOps::FPExt))
    {
        // is this a half-precision float format (FPExt)?
        bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;

        Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits

        // shuffle mask to group components together in each 128-bit lane
        Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                      0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
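    // Byte pairs {0,1 4,5 8,9 12,13} pull the even (x/z) 16-bit words to the
    // front of each 128-bit lane, and {2,3 6,7 10,11 14,15} pack the odd (y/w)
    // words behind them, so each lane holds its components contiguously.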

    Value* vi128XY = nullptr;
    if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) {
        Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
        // after pshufb: group components together in each 128bit lane
        // 256i - 0    1    2    3    4    5    6    7
        //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

        vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
        // after PERMD: move and pack xy components into each 128bit lane
        // 256i - 0    1    2    3    4    5    6    7
        //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
    }

    // do the same for zw components
    Value* vi128ZW = nullptr;
    if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) {
        Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
        vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
    }
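    // VPERMD with pattern {0,1,4,5,2,3,6,7} swaps the middle dword pairs so all
    // x dwords land in the low 128-bit lane and all y dwords in the high lane,
    // matching the per-lane VEXTRACT below.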

    // init denormalize variables if needed
    Instruction::CastOps IntToFpCast;
    Value* conversionFactor;

    switch (conversionType)
    {
    case CONVERT_NORMALIZED:
        IntToFpCast = Instruction::CastOps::SIToFP;
        conversionFactor = VIMMED1((float)(1.0 / 32767.0));
        break;
    case CONVERT_SSCALED:
        IntToFpCast = Instruction::CastOps::SIToFP;
        conversionFactor = VIMMED1((float)(1.0));
        break;
    case CONVERT_USCALED:
        SWR_INVALID("Type should not be sign extended!");
        conversionFactor = nullptr;
        break;
    default:
        SWR_ASSERT(conversionType == CONVERT_NONE);
        conversionFactor = nullptr;
        break;
    }
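    // Worked example: an SNORM16 value of 0x7FFF sign-extends to 32767,
    // converts (SIToFP) to 32767.0f, and scales by 1/32767 to exactly 1.0f
    // (the most negative input, -32768, maps slightly below -1.0f).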

    // sign extend all enabled components; if vVertexElements fills up, output to the current simdvertex
    for (uint32_t i = 0; i < 4; i++)
    {
        if (isComponentEnabled(compMask, i))
        {
            if (compCtrl[i] == ComponentControl::StoreSrc)
            {
                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                if (bFP)
                {
                    // extract 128 bit lanes and convert each half-float component to full float
                    vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
                }
                else
                {
                    // extract 128 bit lanes to sign extend each component
                    vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
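
                    // (CVTPH2PS widens eight half floats to eight single floats;
                    // PMOVSXWD sign-extends eight i16 lanes to i32. Both operate
                    // on the 128-bit half extracted above.)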

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE) {
                        vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
                    }
                }

                currentVertexElement++;
            }
            else
            {
                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
            }

            if (currentVertexElement > 3)
            {
                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);

                // reset to the next vVertexElement to output
                currentVertexElement = 0;
            }
        }
    }
}
else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
{
    // pshufb masks for each component
    Value* vConstMask[2];
    if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) {
        // x/z components live in the even 16-bit words of each dword
        vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                                  0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
    }

    if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) {
        // y/w components live in the odd 16-bit words of each dword
        vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                  2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
    }

    // init denormalize variables if needed
    Instruction::CastOps fpCast;
    Value* conversionFactor;

    switch (conversionType)
    {
    case CONVERT_NORMALIZED:
        fpCast = Instruction::CastOps::UIToFP;
        conversionFactor = VIMMED1((float)(1.0 / 65535.0));
        break;
    case CONVERT_USCALED:
        fpCast = Instruction::CastOps::UIToFP;
        conversionFactor = VIMMED1((float)(1.0f));
        break;
    case CONVERT_SSCALED:
        SWR_INVALID("Type should not be zero extended!");
        conversionFactor = nullptr;
        break;
    default:
        SWR_ASSERT(conversionType == CONVERT_NONE);
        conversionFactor = nullptr;
        break;
    }

    // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
    for (uint32_t i = 0; i < 4; i++)
    {
        if (isComponentEnabled(compMask, i))
        {
            if (compCtrl[i] == ComponentControl::StoreSrc)
            {
                // select correct constMask for x/z or y/w pshufb
                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use the first gather result, else use the second
                uint32_t selectedGather = (i < 2) ? 0 : 1;

                vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
                // after pshufb mask for x channel; z uses the same shuffle from the second gather
                // 256i - 0    1    2    3    4    5    6    7
                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00

                // denormalize if needed
                if (conversionType != CONVERT_NONE)
                {
                    vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
                }

                currentVertexElement++;
            }
            else
            {
                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
            }

            if (currentVertexElement > 3)
            {
                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);

                // reset to the next vVertexElement to output
                currentVertexElement = 0;
            }
        }
    }
}
else
{
    SWR_INVALID("Unsupported conversion type");
}
}

//////////////////////////////////////////////////////////////////////////
/// @brief Output a simdvertex worth of elements to the current outputElt
/// @param pVtxOut - base address of VIN output struct
/// @param outputElt - simdvertex offset in VIN to write to
/// @param numEltsToStore - number of simdvertex rows to write out
/// @param vVertexElements - LLVM Value*[] simdvertex to write out
void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
{
    SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");

    for (uint32_t c = 0; c < numEltsToStore; ++c)
    {
        // STORE expects FP32 x vWidth type, just bitcast if needed
        if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
        {
#if FETCH_DUMP_VERTEX
            PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
#endif
            vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
        }
#if FETCH_DUMP_VERTEX
        else
        {
            PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
        }
#endif
        // outputElt * 4 = offsetting by the size of a simdvertex
        // + c offsets to a 32bit x vWidth row within the current vertex
        Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), nullptr, "destGEP");
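        // e.g. assuming mVWidth == 8, each row is 8 floats (32 bytes) and a
        // simdvertex is 4 such rows, so element c of vertex group 'outputElt'
        // lives at row outputElt * 4 + c, exactly the GEP index above.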
        STORE(vVertexElements[c], dest);
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Generates a constant vector of values based on the
/// ComponentControl value
/// @param ctrl - ComponentControl value
Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
{
    switch (ctrl)
    {
    case NoStore:
        return VUNDEF_I();
    case Store0:
        return VIMMED1(0);
    case Store1Fp:
        return VIMMED1(1.0f);
    case Store1Int:
        return VIMMED1(1);
    case StoreVertexId:
    {
#if USE_SIMD16_SHADERS
        // with 16-wide shaders the fetch context carries the vertex ids as two
        // 8-wide halves; join them into a single simd16 register
        Type* pSimd8FPTy = VectorType::get(mFP32Ty, 8);
        Value* pIdLo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), pSimd8FPTy);
        Value* pIdHi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), pSimd8FPTy);
        return JOIN_16(pIdLo, pIdHi);
#else
        return BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
#endif
    }
    case StoreInstanceId:
    {
        Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
        return VBROADCAST(pId);
    }
    default:
        SWR_INVALID("Invalid component control");
        return VUNDEF_I();
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Returns whether the specified component is enabled in the mask.
/// @param enableMask - enable bits
/// @param component - component to check if enabled.
bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
{
    switch (component)
    {
    // X
    case 0: return (enableMask & ComponentEnable::X);
    // Y
    case 1: return (enableMask & ComponentEnable::Y);
    // Z
    case 2: return (enableMask & ComponentEnable::Z);
    // W
    case 3: return (enableMask & ComponentEnable::W);

    default: return false;
    }
}

// Avoid two threads compiling the same fetch shader simultaneously: the JIT
// cache implementation has problems with concurrent compiles. Right now this
// is only an issue for fetch shaders.
static std::mutex gFetchCodegenMutex;

//////////////////////////////////////////////////////////////////////////
/// @brief JITs from fetch shader IR
/// @param hJitMgr - JitManager handle
/// @param func - LLVM function IR
/// @return PFN_FETCH_FUNC - pointer to fetch code
PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
{
    const llvm::Function* func = (const llvm::Function*)hFunc;
    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
    PFN_FETCH_FUNC pfnFetch;

    gFetchCodegenMutex.lock();
    pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
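    // (getFunctionAddress triggers MCJIT code generation for the module on
    // first use, which is why this whole sequence is serialized by the mutex.)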
    // MCJIT finalizes modules the first time you JIT code from them. Once finalized, you cannot add new IR to the module
    pJitMgr->mIsModuleFinalized = true;

#if defined(KNOB_SWRC_TRACING)
    char fName[1024];
    const char* funcName = func->getName().data();
    sprintf(fName, "%s.bin", funcName);
    FILE* fd = fopen(fName, "wb");
    fwrite((void*)pfnFetch, 1, 2048, fd);
    fclose(fd);
#endif

    pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
    gFetchCodegenMutex.unlock();

    return pfnFetch;
}

//////////////////////////////////////////////////////////////////////////
/// @brief JIT compiles fetch shader
/// @param hJitMgr - JitManager handle
/// @param state - fetch state to build function from
extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
{
    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);

    pJitMgr->SetupNewModule();

    FetchJit theJit(pJitMgr);
    HANDLE hFunc = theJit.Create(state);

    return JitFetchFunc(hJitMgr, hFunc);
}
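
// Usage sketch (illustrative only): a driver compiles once per unique fetch
// state and caches the resulting pointer, e.g.
//
//     PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, fetchState);
//
// where 'fetchState' stands in for a fully populated FETCH_COMPILE_STATE; the
// returned function is then invoked per draw with arguments matching the
// PFN_FETCH_FUNC typedef (see fetch_jit.h).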