/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation.  All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file fetch_jit.cpp
 *
 * @brief Implementation of the fetch jitter
 *
 ******************************************************************************/
#include "jit_pch.hpp"
#include "builder_gfx_mem.h"
#include "fetch_jit.h"
#include "gen_state_llvm.h"
#include "functionpasses/passes.h"

//#define FETCH_DUMP_VERTEX 1
using namespace SwrJit;

bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
//////////////////////////////////////////////////////////////////////////
/// Interface to Jitting a fetch shader
//////////////////////////////////////////////////////////////////////////
struct FetchJit : public BuilderGfxMem
{
    FetchJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr) {}

    Function* Create(const FETCH_COMPILE_STATE& fetchState);

    Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
    Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
    Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);

    template <typename T>
    Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex);

    // package up Shuffle*bpcGatherd args into a tuple for convenience
    typedef std::tuple<Value*&,
                       Value*,
                       const Instruction::CastOps,
                       const ConversionType,
                       uint32_t&,
                       uint32_t&,
                       const ComponentEnable,
                       const ComponentControl (&)[4],
                       Value* (&)[4],
                       const uint32_t (&)[4]>
        Shuffle8bpcArgs;

    void Shuffle8bpcGatherd16(Shuffle8bpcArgs& args);
    void Shuffle8bpcGatherd(Shuffle8bpcArgs& args);

    typedef std::tuple<Value* (&)[2],
                       Value*,
                       const Instruction::CastOps,
                       const ConversionType,
                       uint32_t&,
                       uint32_t&,
                       const ComponentEnable,
                       const ComponentControl (&)[4],
                       Value* (&)[4]>
        Shuffle16bpcArgs;

    void Shuffle16bpcGather16(Shuffle16bpcArgs& args);
    void Shuffle16bpcGather(Shuffle16bpcArgs& args);

    void StoreVertexElements(Value*         pVtxOut,
                             const uint32_t outputElt,
                             const uint32_t numEltsToStore,
                             Value* (&vVertexElements)[4]);

    Value* GenerateCompCtrlVector(const ComponentControl ctrl);

    void JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
                           Value*                     streams,
                           Value*                     vIndices,
                           Value*                     pVtxOut);

    bool IsOddFormat(SWR_FORMAT format);
    bool IsUniformFormat(SWR_FORMAT format);
    void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
    void CreateGatherOddFormats(
        SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
    void ConvertFormat(SWR_FORMAT format, Value* texels[4]);

    Value* mpFetchInfo;
};
Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
{
    std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
    fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));

    Function* fetch = Function::Create(
        JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
    BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);

    fetch->getParent()->setModuleIdentifier(fetch->getName());

    IRB()->SetInsertPoint(entry);

    auto argitr = fetch->arg_begin();

    // Fetch shader arguments
    Value* privateContext = &*argitr;
    ++argitr;
    privateContext->setName("privateContext");
    SetPrivateContext(privateContext);

    mpWorkerData = &*argitr;
    ++argitr;
    mpWorkerData->setName("pWorkerData");

    mpFetchInfo = &*argitr;
    ++argitr;
    mpFetchInfo->setName("fetchInfo");

    Value* pVtxOut = &*argitr;
    pVtxOut->setName("vtxOutput");

    uint32_t baseWidth = mVWidth;

    SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth);

    // Override builder target width to force 16-wide SIMD
#if USE_SIMD16_SHADERS
    SetTargetWidth(16);
#endif

    pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0));

    // SWR_FETCH_CONTEXT::pStreams
    Value* streams = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_pStreams});
    streams->setName("pStreams");

    // SWR_FETCH_CONTEXT::pIndices
    Value* indices = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpIndices});
    indices->setName("pIndices");

    // SWR_FETCH_CONTEXT::pLastIndex
    Value* pLastIndex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpLastIndex});
    pLastIndex->setName("pLastIndex");
    Value* vIndices;
    switch (fetchState.indexType)
    {
    case R8_UINT:
        indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
        if (fetchState.bDisableIndexOOBCheck)
        {
            vIndices = LOAD(
                BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)),
                {(uint32_t)0});
            vIndices = Z_EXT(vIndices, mSimdInt32Ty);
        }
        else
        {
            vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
        }
        break;
    case R16_UINT:
        if (fetchState.bDisableIndexOOBCheck)
        {
            vIndices = LOAD(
                BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)),
                {(uint32_t)0});
            vIndices = Z_EXT(vIndices, mSimdInt32Ty);
        }
        else
        {
            vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
        }
        break;
    case R32_UINT:
        (fetchState.bDisableIndexOOBCheck)
            ? vIndices = LOAD(indices,
                              "",
                              PointerType::get(mSimdInt32Ty, 0),
                              MEM_CLIENT::GFX_MEM_CLIENT_FETCH)
            : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
        break; // incoming type is already 32bit int
    default:
        SWR_INVALID("Unsupported index type");
        vIndices = nullptr;
        break;
    }
    if (fetchState.bForceSequentialAccessEnable)
    {
        Value* pOffsets = mVWidth == 8 ? C({0, 1, 2, 3, 4, 5, 6, 7})
                                       : C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});

        // VertexData buffers are accessed sequentially, the index is equal to the vertex number
        vIndices = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
        vIndices = ADD(vIndices, pOffsets);
    }

    Value* vVertexId = vIndices;
    if (fetchState.bVertexIDOffsetEnable)
    {
        // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally
        // correct
        Value* vBaseVertex  = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
        Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
        vVertexId           = ADD(vIndices, vBaseVertex);
        vVertexId           = ADD(vVertexId, vStartVertex);
    }

    // store out vertex IDs
    if (mVWidth == 16)
    {
        // store out in simd8 halves until core supports 16-wide natively
        auto vVertexIdLo = EXTRACT_16(vVertexId, 0);
        auto vVertexIdHi = EXTRACT_16(vVertexId, 1);
        STORE(vVertexIdLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
        STORE(vVertexIdHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2}));
    }
    else if (mVWidth == 8)
    {
        STORE(vVertexId, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
    }

    // store out cut mask if enabled
    if (fetchState.bEnableCutIndex)
    {
        Value* vCutIndex = VIMMED1(fetchState.cutIndex);
        Value* cutMask   = VMASK(ICMP_EQ(vIndices, vCutIndex));

        if (mVWidth == 16)
        {
            auto cutMaskLo = EXTRACT_16(cutMask, 0);
            auto cutMaskHi = EXTRACT_16(cutMask, 1);
            STORE(cutMaskLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
            STORE(cutMaskHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask2}));
        }
        else if (mVWidth == 8)
        {
            STORE(cutMask, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
        }
    }

    // Fetch attributes from memory and output to a simdvertex struct
    JitGatherVertices(fetchState, streams, vIndices, pVtxOut);

    RET_VOID();
    JitManager::DumpToFile(fetch, "src");

#if defined(_DEBUG)
    verifyFunction(*fetch);
#endif

    ::FunctionPassManager setupPasses(JM()->mpCurrentModule);

    ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
    setupPasses.add(createBreakCriticalEdgesPass());
    setupPasses.add(createCFGSimplificationPass());
    setupPasses.add(createEarlyCSEPass());
    setupPasses.add(createPromoteMemoryToRegisterPass());

    setupPasses.run(*fetch);

    JitManager::DumpToFile(fetch, "se");

    ::FunctionPassManager optPasses(JM()->mpCurrentModule);

    ///@todo Haven't touched these either. Need to remove some of these and add others.
    optPasses.add(createCFGSimplificationPass());
    optPasses.add(createEarlyCSEPass());
    optPasses.add(createInstructionCombiningPass());
    optPasses.add(createConstantPropagationPass());
    optPasses.add(createSCCPPass());
    optPasses.add(createAggressiveDCEPass());

    optPasses.run(*fetch);
    optPasses.add(createLowerX86Pass(this));
    optPasses.run(*fetch);

    JitManager::DumpToFile(fetch, "opt");

    // Revert 16-wide override
#if USE_SIMD16_SHADERS
    SetTargetWidth(baseWidth);
#endif

    return fetch;
}
// returns true for odd formats that require special state.gather handling
bool FetchJit::IsOddFormat(SWR_FORMAT format)
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
    {
        return true;
    }
    return false;
}
// format is uniform if all components are the same size and type
bool FetchJit::IsUniformFormat(SWR_FORMAT format)
{
    const SWR_FORMAT_INFO& info  = GetFormatInfo(format);
    uint32_t               bpc0  = info.bpc[0];
    uint32_t               type0 = info.type[0];

    for (uint32_t c = 1; c < info.numComps; ++c)
    {
        if (bpc0 != info.bpc[c] || type0 != info.type[c])
        {
            return false;
        }
    }
    return true;
}
// unpacks components based on format
// foreach component in the pixel
//   mask off everything but this component
//   shift component to LSB
void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);

    uint32_t bitOffset = 0;
    for (uint32_t c = 0; c < info.numComps; ++c)
    {
        uint32_t swizzledIndex = info.swizzle[c];
        uint32_t compBits      = info.bpc[c];
        uint32_t bitmask       = ((1 << compBits) - 1) << bitOffset;
        Value*   comp          = AND(vInput, bitmask);
        comp                   = LSHR(comp, bitOffset);

        result[swizzledIndex] = comp;
        bitOffset += compBits;
    }
}
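// Illustrative trace (added commentary, not from the original source): assuming a
// packed 10:10:10:2 format with an identity swizzle, the loop above would compute
//   c=0: bitmask = 0x000003FF, shift by  0
//   c=1: bitmask = 0x000FFC00, shift by 10
//   c=2: bitmask = 0x3FF00000, shift by 20
//   c=3: bitmask = 0xC0000000, shift by 30
// leaving each component zero-based in the LSBs of its own result vector.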
// gather for odd component size formats
// gather SIMD full pixels per lane then shift/mask to move each component to their
// own vector
void FetchJit::CreateGatherOddFormats(
    SWR_FORMAT format, Value* pMask, Value* xpBase, Value* pOffsets, Value* pResult[4])
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);

    // only works if pixel size is <= 32bits
    SWR_ASSERT(info.bpp <= 32);

    Value* pGather;
    if (info.bpp == 32)
    {
        pGather =
            GATHERDD(VIMMED1(0), xpBase, pOffsets, pMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
    }
    else
    {
        // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
        Value* pMem = ALLOCA(mSimdInt32Ty);
        STORE(VIMMED1(0u), pMem);

        Value* pDstMem = POINTER_CAST(pMem, mInt32PtrTy);

        for (uint32_t lane = 0; lane < mVWidth; ++lane)
        {
            // Get index
            Value* index = VEXTRACT(pOffsets, C(lane));
            Value* mask  = VEXTRACT(pMask, C(lane));

            // use branch around load based on mask
            // Needed to avoid page-faults on unmasked lanes
            BasicBlock* pCurrentBB = IRB()->GetInsertBlock();
            BasicBlock* pMaskedLoadBlock =
                BasicBlock::Create(JM()->mContext, "MaskedLaneLoad", pCurrentBB->getParent());
            BasicBlock* pEndLoadBB =
                BasicBlock::Create(JM()->mContext, "AfterMaskedLoad", pCurrentBB->getParent());

            COND_BR(mask, pMaskedLoadBlock, pEndLoadBB);

            JM()->mBuilder.SetInsertPoint(pMaskedLoadBlock);

            switch (info.bpp)
            {
            case 8:
            {
                Value* pDst  = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
                STORE(LOAD(xpSrc, "", mInt8PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
                break;
            }

            case 16:
            {
                Value* pDst  = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
                STORE(LOAD(xpSrc, "", mInt16PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
                break;
            }

            case 24:
            {
                // First 16-bits of data
                Value* pDst  = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
                STORE(LOAD(xpSrc, "", mInt16PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);

                // Last 8-bits of data
                pDst  = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
                xpSrc = ADD(xpSrc, C((int64_t)2));
                STORE(LOAD(xpSrc, "", mInt8PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
                break;
            }

            default:
                SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
                break;
            }

            BR(pEndLoadBB);

            JM()->mBuilder.SetInsertPoint(pEndLoadBB);
        }

        pGather = LOAD(pMem);
    }
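    // Note (added commentary): the per-lane loop above emits IR shaped like
    //   current block:    br mask[lane], MaskedLaneLoad, AfterMaskedLoad
    //   MaskedLaneLoad:   load 1, 2, or 3 bytes from xpBase + offset; store into pMem[lane]
    //   AfterMaskedLoad:  fall through to the next lane
    // so masked-off lanes never touch memory and keep the zero written by
    // STORE(VIMMED1(0u), pMem) above.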
    for (uint32_t comp = 0; comp < 4; ++comp)
    {
        pResult[comp] = VIMMED1((int)info.defaults[comp]);
    }

    UnpackComponents(format, pGather, pResult);

    // cast the gathered components to the correct scalar float type
    pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
    pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
    pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
    pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
}
void FetchJit::ConvertFormat(SWR_FORMAT format, Value* texels[4])
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);

    for (uint32_t c = 0; c < info.numComps; ++c)
    {
        uint32_t compIndex = info.swizzle[c];

        // skip any conversion on UNUSED components
        if (info.type[c] == SWR_TYPE_UNUSED)
        {
            continue;
        }

        if (info.isNormalized[c])
        {
            if (info.type[c] == SWR_TYPE_SNORM)
            {
                /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to
                /// -16.

                /// result = c * (1.0f / (2^(n-1) - 1);
                uint32_t n      = info.bpc[c];
                uint32_t pow2   = 1 << (n - 1);
                float    scale  = 1.0f / (float)(pow2 - 1);
                Value*   vScale = VIMMED1(scale);
                texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
                texels[compIndex] = FMUL(texels[compIndex], vScale);
            }
            else
            {
                SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);

                /// result = c * (1.0f / (2^n - 1))
                uint32_t n    = info.bpc[c];
                uint32_t pow2 = 1 << n;
                // special case 24bit unorm format, which requires a full divide to meet ULP
                // requirement
                if (n == 24)
                {
                    float  scale  = (float)(pow2 - 1);
                    Value* vScale = VIMMED1(scale);
                    texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                    texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
                    texels[compIndex] = FDIV(texels[compIndex], vScale);
                }
                else
                {
                    float  scale  = 1.0f / (float)(pow2 - 1);
                    Value* vScale = VIMMED1(scale);
                    texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                    texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
                    texels[compIndex] = FMUL(texels[compIndex], vScale);
                }
            }
        }
    }
}
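// Worked numbers (added commentary): for an 8-bit UNORM component, n = 8 and
// scale = 1/255, so the raw value 255 converts to exactly 1.0f. For an 8-bit
// SNORM component, scale = 1/127, so +127 -> 1.0f and -127 -> -1.0f; the
// most-negative encoding (-128) lands slightly below -1.0f, which is the open
// question flagged in the @todo above.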
//////////////////////////////////////////////////////////////////////////
/// @brief Loads attributes from memory using AVX2 GATHER(s)
/// @param fetchState - info about attributes to be fetched from memory
/// @param streams - value pointer to the current vertex stream
/// @param vIndices - vector value of indices to gather
/// @param pVtxOut - value pointer to output simdvertex struct
void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
                                 Value*                     streams,
                                 Value*                     vIndices,
                                 Value*                     pVtxOut)
{
    uint32_t currentVertexElement = 0;
    uint32_t outputElt            = 0;
    Value*   vVertexElements[4];

    Value* startVertex   = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
    Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
    Value* curInstance   = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
    Value* vBaseVertex   = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
    curInstance->setName("curInstance");
    for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
    {
        const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];

        // skip element if all components are disabled
        if (ied.ComponentPacking == ComponentEnable::NONE)
        {
            continue;
        }

        const SWR_FORMAT_INFO& info = GetFormatInfo((SWR_FORMAT)ied.Format);
        SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
        uint32_t bpc =
            info.bpp /
            info.numComps; ///@todo Code below assumes all components are same size. Need to fix.

        Value* stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});

        Value* stride  = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
        Value* vStride = VBROADCAST(stride);

        // max vertex index that is fully in bounds
        Value* maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
        maxVertex        = LOAD(maxVertex);

        Value* minVertex = NULL;
        if (fetchState.bPartialVertexBuffer)
        {
            // min vertex index for low bounds OOB checking
            minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
            minVertex = LOAD(minVertex);
        }

        if (fetchState.bInstanceIDOffsetEnable)
        {
            // the InstanceID (curInstance) value is offset by StartInstanceLocation
            curInstance = ADD(curInstance, startInstance);
        }

        Value* vCurIndices;
        Value* startOffset;
        Value* vInstanceStride = VIMMED1(0);

        if (ied.InstanceEnable)
        {
            Value* stepRate = C(ied.InstanceAdvancementState);

            // prevent a div by 0 for 0 step rate
            Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
            stepRate             = SELECT(isNonZeroStep, stepRate, C(1));

            // calc the current offset into instanced data buffer
            Value* calcInstance = UDIV(curInstance, stepRate);

            // if step rate is 0, every instance gets instance 0
            calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));

            vCurIndices = VBROADCAST(calcInstance);
            startOffset = startInstance;
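            // Example (added commentary): with InstanceAdvancementState = 2 the UDIV
            // above maps curInstance 0,1,2,3,4,5 -> 0,0,1,1,2,2, so each instanced
            // value is reused for two consecutive instances; a step rate of 0 is
            // clamped to 1 first and then forced to element 0 by the SELECT, so
            // every instance reads element 0.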
        }
        else if (ied.InstanceStrideEnable)
        {
            // grab the instance advancement state, determines stride in bytes from one instance to
            // the next
            Value* stepRate = C(ied.InstanceAdvancementState);
            vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));

            // offset indices by baseVertex
            vCurIndices = ADD(vIndices, vBaseVertex);

            startOffset = startVertex;
            SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
        }
        else
        {
            // offset indices by baseVertex
            vCurIndices = ADD(vIndices, vBaseVertex);
            startOffset = startVertex;
        }
        // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
        // do 64bit address offset calculations.

        // calculate byte offset to the start of the VB
        Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));

        // VGATHER* takes an *i8 src pointer so that's what stream is
        Value* pStreamBaseGFX = ADD(stream, baseOffset);

        // if we have a start offset, subtract from max vertex. Used for OOB check
        maxVertex     = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
        Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
        // if we have a negative value, we're already OOB. clamp at 0.
        maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));

        if (fetchState.bPartialVertexBuffer)
        {
            // similarly for min vertex
            minVertex     = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
            Value* minNeg = ICMP_SLT(minVertex, C((int64_t)0));
            minVertex     = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
        }

        // Load the in bounds size of a partially valid vertex
        Value* partialInboundsSize =
            GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
        partialInboundsSize       = LOAD(partialInboundsSize);
        Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
        Value* vBpp               = VBROADCAST(C(info.Bpp));
        Value* vAlignmentOffsets  = VBROADCAST(C(ied.AlignedByteOffset));

        // is the element <= the partially valid size
        Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));

        // override cur indices with 0 if pitch is 0
        Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
        vCurIndices           = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);

        // are vertices partially OOB?
        Value* vMaxVertex      = VBROADCAST(maxVertex);
        Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);

        // are vertices fully in bounds?
        Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);

        Value* vGatherMask;
        if (fetchState.bPartialVertexBuffer)
        {
            // are vertices below minVertex limit?
            Value* vMinVertex     = VBROADCAST(minVertex);
            Value* vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);

            // only fetch lanes that pass both tests
            vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
        }
        else
        {
            vGatherMask = vMaxGatherMask;
        }

        // blend in any partially OOB indices that have valid elements
        vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
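        // Example (added commentary): if maxVertex is 100, an index of exactly 100
        // fails the full in-bounds test (ICMP_ULT) but matches vPartialOOBMask, so
        // that lane is fetched only when its element also fits within the partially
        // valid tail per vElementInBoundsMask; indices beyond 100 stay masked off.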
        // calculate the actual offsets into the VB
        Value* vOffsets = MUL(vCurIndices, vStride);
        vOffsets        = ADD(vOffsets, vAlignmentOffsets);

        // if instance stride enable is:
        // true - add product of the instanceID and advancement state to the offset into the VB
        // false - value of vInstanceStride has been initialized to zero
        vOffsets = ADD(vOffsets, vInstanceStride);
        // Packing and component control
        ComponentEnable        compMask = (ComponentEnable)ied.ComponentPacking;
        const ComponentControl compCtrl[4]{(ComponentControl)ied.ComponentControl0,
                                           (ComponentControl)ied.ComponentControl1,
                                           (ComponentControl)ied.ComponentControl2,
                                           (ComponentControl)ied.ComponentControl3};

        // Special gather/conversion for formats without equal component sizes
        if (IsOddFormat((SWR_FORMAT)ied.Format))
        {
            Value* pResults[4];
            CreateGatherOddFormats(
                (SWR_FORMAT)ied.Format, vGatherMask, pStreamBaseGFX, vOffsets, pResults);
            ConvertFormat((SWR_FORMAT)ied.Format, pResults);

            for (uint32_t c = 0; c < 4; c += 1)
            {
                if (isComponentEnabled(compMask, c))
                {
                    vVertexElements[currentVertexElement++] = pResults[c];
                    if (currentVertexElement > 3)
                    {
                        StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                        // reset to the next vVertexElement to output
                        currentVertexElement = 0;
                    }
                }
            }
        }
        else if (info.type[0] == SWR_TYPE_FLOAT)
        {
            ///@todo: support 64 bit vb accesses
            Value* gatherSrc = VIMMED1(0.0f);

            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
                       "Unsupported format for standard gather fetch.");

            // Gather components from memory to store in a simdvertex structure
            switch (bpc)
            {
            case 16:
            {
                Value* vGatherResult[2];

                // if we have at least one component out of x or y to fetch
                if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
                {
                    vGatherResult[0] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
                    // e.g. result of first 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
                }

                // if we have at least one component out of z or w to fetch
                if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
                {
                    // offset base to the next components(zw) in the vertex to gather
                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));

                    vGatherResult[1] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
                    // e.g. result of second 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                }

                // if we have at least one component to shuffle into place
                if (compMask)
                {
                    Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
                                                                  pVtxOut,
                                                                  Instruction::CastOps::FPExt,
                                                                  CONVERT_NONE,
                                                                  currentVertexElement,
                                                                  outputElt,
                                                                  compMask,
                                                                  compCtrl,
                                                                  vVertexElements);

                    // Shuffle gathered components into place in simdvertex struct
                    mVWidth == 16 ? Shuffle16bpcGather16(args)
                                  : Shuffle16bpcGather(args); // outputs to vVertexElements ref
                }
            }
            break;
            case 32:
            {
                for (uint32_t i = 0; i < 4; i += 1)
                {
                    if (isComponentEnabled(compMask, i))
                    {
                        // if we need to gather the component
                        if (compCtrl[i] == StoreSrc)
                        {
                            // Gather a SIMD of vertices
                            // APIs allow a 4GB range for offsets
                            // However, GATHERPS uses signed 32-bit offsets, so +/- 2GB range :(
                            // Add 2GB to the base pointer and 2GB to the offsets. This makes
                            // "negative" (large) offsets into positive offsets and small offsets
                            // into negative offsets.
                            Value* vNewOffsets = ADD(vOffsets, VIMMED1(0x80000000));
                            vVertexElements[currentVertexElement++] =
                                GATHERPS(gatherSrc,
                                         ADD(pStreamBaseGFX, C((uintptr_t)0x80000000U)),
                                         vNewOffsets,
                                         vGatherMask,
                                         1,
                                         MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
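                            // Worked example (added commentary): an API offset of
                            // 0xF0000000 would wrap negative as a signed 32-bit
                            // gather offset; after the VIMMED1(0x80000000) bias it
                            // becomes 0x70000000 (positive), and the matching +2GB
                            // on the base pointer cancels the bias, leaving the
                            // effective address unchanged.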
                        }
                        else
                        {
                            vVertexElements[currentVertexElement++] =
                                GenerateCompCtrlVector(compCtrl[i]);
                        }

                        if (currentVertexElement > 3)
                        {
                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                            // reset to the next vVertexElement to output
                            currentVertexElement = 0;
                        }
                    }

                    // offset base to the next component in the vertex to gather
                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
                }
            }
            break;
            case 64:
            {
                for (uint32_t i = 0; i < 4; i += 1)
                {
                    if (isComponentEnabled(compMask, i))
                    {
                        // if we need to gather the component
                        if (compCtrl[i] == StoreSrc)
                        {
                            Value* vShufLo;
                            Value* vShufHi;
                            Value* vShufAll;

                            if (mVWidth == 8)
                            {
                                vShufLo  = C({0, 1, 2, 3});
                                vShufHi  = C({4, 5, 6, 7});
                                vShufAll = C({0, 1, 2, 3, 4, 5, 6, 7});
                            }
                            else
                            {
                                SWR_ASSERT(mVWidth == 16);
                                vShufLo  = C({0, 1, 2, 3, 4, 5, 6, 7});
                                vShufHi  = C({8, 9, 10, 11, 12, 13, 14, 15});
                                vShufAll =
                                    C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
                            }

                            Value* vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
                            Value* vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);

                            Value* vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
                            Value* vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);

                            Value* vZeroDouble = VECTOR_SPLAT(
                                mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));

                            Value* pGatherLo =
                                GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsLo, vMaskLo);
                            Value* pGatherHi =
                                GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsHi, vMaskHi);

                            pGatherLo = VCVTPD2PS(pGatherLo);
                            pGatherHi = VCVTPD2PS(pGatherHi);

                            Value* pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);

                            vVertexElements[currentVertexElement++] = pGather;
                        }
                        else
                        {
                            vVertexElements[currentVertexElement++] =
                                GenerateCompCtrlVector(compCtrl[i]);
                        }

                        if (currentVertexElement > 3)
                        {
                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                            // reset to the next vVertexElement to output
                            currentVertexElement = 0;
                        }
                    }

                    // offset base to the next component in the vertex to gather
                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)8));
                }
            }
            break;
            default:
                SWR_INVALID("Tried to fetch invalid FP format");
                break;
            }
        }
        else
        {
            Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
            ConversionType       conversionType = CONVERT_NONE;

            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
                       "Unsupported format for standard gather fetch.");

            switch (info.type[0])
            {
            case SWR_TYPE_UNORM:
                conversionType = CONVERT_NORMALIZED;
            case SWR_TYPE_UINT:
                extendCastType = Instruction::CastOps::ZExt;
                break;
            case SWR_TYPE_SNORM:
                conversionType = CONVERT_NORMALIZED;
            case SWR_TYPE_SINT:
                extendCastType = Instruction::CastOps::SExt;
                break;
            case SWR_TYPE_USCALED:
                conversionType = CONVERT_USCALED;
                extendCastType = Instruction::CastOps::UIToFP;
                break;
            case SWR_TYPE_SSCALED:
                conversionType = CONVERT_SSCALED;
                extendCastType = Instruction::CastOps::SIToFP;
                break;
            case SWR_TYPE_SFIXED:
                conversionType = CONVERT_SFIXED;
                extendCastType = Instruction::CastOps::SExt;
                break;
            default:
                break;
            }

            // value substituted when component of gather is masked
            Value* gatherSrc = VIMMED1(0);

            // Gather components from memory to store in a simdvertex structure
            switch (bpc)
            {
            case 8:
            {
                // if we have at least one component to fetch
                if (compMask)
                {
                    Value* vGatherResult = GATHERDD(gatherSrc,
                                                    pStreamBaseGFX,
                                                    vOffsets,
                                                    vGatherMask,
                                                    1,
                                                    MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
                    // e.g. result of an 8x32bit integer gather for 8bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

                    Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult,
                                                                 pVtxOut,
                                                                 extendCastType,
                                                                 conversionType,
                                                                 currentVertexElement,
                                                                 outputElt,
                                                                 compMask,
                                                                 compCtrl,
                                                                 vVertexElements,
                                                                 info.swizzle);

                    // Shuffle gathered components into place in simdvertex struct
                    mVWidth == 16 ? Shuffle8bpcGatherd16(args)
                                  : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
                }
            }
            break;
            case 16:
            {
                Value* vGatherResult[2];

                // if we have at least one component out of x or y to fetch
                if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
                {
                    vGatherResult[0] = GATHERDD(gatherSrc,
                                                pStreamBaseGFX,
                                                vOffsets,
                                                vGatherMask,
                                                1,
                                                MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
                    // e.g. result of first 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
                }

                // if we have at least one component out of z or w to fetch
                if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
                {
                    // offset base to the next components(zw) in the vertex to gather
                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));

                    vGatherResult[1] = GATHERDD(gatherSrc,
                                                pStreamBaseGFX,
                                                vOffsets,
                                                vGatherMask,
                                                1,
                                                MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
                    // e.g. result of second 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                }

                // if we have at least one component to shuffle into place
                if (compMask)
                {
                    Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
                                                                  pVtxOut,
                                                                  extendCastType,
                                                                  conversionType,
                                                                  currentVertexElement,
                                                                  outputElt,
                                                                  compMask,
                                                                  compCtrl,
                                                                  vVertexElements);

                    // Shuffle gathered components into place in simdvertex struct
                    mVWidth == 16 ? Shuffle16bpcGather16(args)
                                  : Shuffle16bpcGather(args); // outputs to vVertexElements ref
                }
            }
            break;
            case 32:
            {
                // Gathered components into place in simdvertex struct
                for (uint32_t i = 0; i < 4; i++)
                {
                    if (isComponentEnabled(compMask, i))
                    {
                        // if we need to gather the component
                        if (compCtrl[i] == StoreSrc)
                        {
                            Value* pGather = GATHERDD(gatherSrc,
                                                      pStreamBaseGFX,
                                                      vOffsets,
                                                      vGatherMask,
                                                      1,
                                                      MEM_CLIENT::GFX_MEM_CLIENT_FETCH);

                            if (conversionType == CONVERT_USCALED)
                            {
                                pGather = UI_TO_FP(pGather, mSimdFP32Ty);
                            }
                            else if (conversionType == CONVERT_SSCALED)
                            {
                                pGather = SI_TO_FP(pGather, mSimdFP32Ty);
                            }
                            else if (conversionType == CONVERT_SFIXED)
                            {
                                pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty),
                                               VBROADCAST(C(1 / 65536.0f)));
                            }

                            vVertexElements[currentVertexElement++] = pGather;

                            // e.g. result of a single 8x32bit integer gather for 32bit components
                            // 256i - 0    1    2    3    4    5    6    7
                            //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
                        }
                        else
                        {
                            vVertexElements[currentVertexElement++] =
                                GenerateCompCtrlVector(compCtrl[i]);
                        }

                        if (currentVertexElement > 3)
                        {
                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);

                            // reset to the next vVertexElement to output
                            currentVertexElement = 0;
                        }
                    }

                    // offset base to the next component in the vertex to gather
                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
                }
            }
            break;
            }
        }
    }
    // if we have a partially filled vVertexElement struct, output it
    if (currentVertexElement > 0)
    {
        StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
    }
}
template <typename T>
Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex)
{
    SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty,
               "Function expects gfxptr_t for both input parameters.");

    Type* Ty = nullptr;

    static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t),
                  "Unsupported type for use with GetSimdValidIndicesHelper<T>");
    constexpr bool bSize = (sizeof(T) == sizeof(uint16_t));
    if (bSize)
    {
        Ty = mInt16PtrTy;
    }
    else if (sizeof(T) == sizeof(uint8_t))
    {
        Ty = mInt8PtrTy;
    }
    else
    {
        SWR_ASSERT(false, "This should never happen as per static_assert above.");
    }

    Value* vIndices = VUNDEF_I();

    {
        // store 0 index on stack to be used to conditionally load from if index address is OOB
        Value* pZeroIndex = ALLOCA(Ty->getPointerElementType());
        STORE(C((T)0), pZeroIndex);

        // Load a SIMD of index pointers
        for (int64_t lane = 0; lane < mVWidth; lane++)
        {
            // Calculate the address of the requested index
            Value* pIndex = GEP(pIndices, C(lane), Ty);

            pLastIndex = INT_TO_PTR(pLastIndex, Ty);

            // check if the address is less than the max index
            Value* mask = ICMP_ULT(pIndex, pLastIndex);

            // if valid, load the index. if not, load 0 from the stack
            Value* pValid = SELECT(mask, pIndex, pZeroIndex);
            Value* index  = LOAD(pValid, "valid index", Ty, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);

            // zero extended index to 32 bits and insert into the correct simd lane
            index    = Z_EXT(index, mInt32Ty);
            vIndices = VINSERT(vIndices, index, lane);
        }
    }

    return vIndices;
}
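// Example (added commentary): with mVWidth = 8 and only five indices remaining
// before pLastIndex, lanes 0-4 load real indices while lanes 5-7 fail the
// ICMP_ULT check and are redirected to pZeroIndex, reading index 0 instead of
// faulting past the end of the index buffer.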
//////////////////////////////////////////////////////////////////////////
/// @brief Loads a simd of valid indices. OOB indices are set to 0
/// *Note* have to do 8bit index checking in scalar until we have AVX-512
/// support
/// @param pIndices - pointer to 8 bit indices
/// @param pLastIndex - pointer to last valid index
Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
{
    return GetSimdValidIndicesHelper<uint8_t>(pIndices, pLastIndex);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Loads a simd of valid indices. OOB indices are set to 0
/// *Note* have to do 16bit index checking in scalar until we have AVX-512
/// support
/// @param pIndices - pointer to 16 bit indices
/// @param pLastIndex - pointer to last valid index
Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
{
    return GetSimdValidIndicesHelper<uint16_t>(pIndices, pLastIndex);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Loads a simd of valid indices. OOB indices are set to 0
/// @param pIndices - pointer to 32 bit indices
/// @param pLastIndex - pointer to last valid index
Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
{
    DataLayout dL(JM()->mpCurrentModule);
    Value*     iLastIndex = pLastIndex;
    Value*     iIndices   = pIndices;

    // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
    Value* numIndicesLeft = SUB(iLastIndex, iIndices);
    numIndicesLeft        = TRUNC(numIndicesLeft, mInt32Ty);
    numIndicesLeft        = SDIV(numIndicesLeft, C(4));

    // create a vector of index counts from the base index ptr passed into the fetch
    Constant* vIndexOffsets;
    if (mVWidth == 8)
    {
        vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7});
    }
    else
    {
        vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
    }

    // compare index count to the max valid index
    // e.g. vMaxIndex      4 4 4 4 4 4 4 4 : 4 indices left to load
    //      vIndexOffsets  0 1 2 3 4 5 6 7
    //      ------------------------------
    //      vIndexMask    -1-1-1-1 0 0 0 0 : offsets < max pass
    //      vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
    Value* vMaxIndex  = VBROADCAST(numIndicesLeft);
    Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);

    // Load the indices; OOB loads 0
    return MASKED_LOAD(pIndices,
                       4,
                       vIndexMask,
                       VIMMED1(0),
                       "vIndices",
                       PointerType::get(mSimdInt32Ty, 0),
                       MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
/// denormalizes if needed, converts to F32 if needed, and positions in
/// the proper SIMD rows to be output to the simdvertex structure
/// @param args: (tuple of args, listed below)
///   @param vGatherResult - 8 gathered 8bpc vertices
///   @param pVtxOut - base pointer to output simdvertex struct
///   @param extendType - sign extend or zero extend
///   @param bNormalized - do we need to denormalize?
///   @param currentVertexElement - reference to the current vVertexElement
///   @param outputElt - reference to the current offset from simdvertex we're outputting to
///   @param compMask - component packing mask
///   @param compCtrl - component control val
///   @param vVertexElements[4] - vertex components to output
///   @param swizzle[4] - component swizzle location
void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs& args)
{
    // Unpack tuple args
    Value*& vGatherResult = std::get<0>(args);
    Value*  pVtxOut       = std::get<1>(args);
    const Instruction::CastOps extendType     = std::get<2>(args);
    const ConversionType       conversionType = std::get<3>(args);
    uint32_t& currentVertexElement = std::get<4>(args);
    uint32_t& outputElt            = std::get<5>(args);
    const ComponentEnable compMask = std::get<6>(args);
    const ComponentControl (&compCtrl)[4] = std::get<7>(args);
    Value* (&vVertexElements)[4]          = std::get<8>(args);
    const uint32_t (&swizzle)[4]          = std::get<9>(args);
    // cast types
    Type* vGatherTy = VectorType::get(mInt32Ty, 8);
    Type* v32x8Ty   = VectorType::get(mInt8Ty, 32);

    // have to do extra work for sign extending
    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
    {
        Type* v16x8Ty = VectorType::get(mInt8Ty, 16); // 8x16bit ints in a 128bit lane
        Type* v128Ty  = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);

        // shuffle mask, including any swizzling
        const char x = (char)swizzle[0];
        const char y = (char)swizzle[1];
        const char z = (char)swizzle[2];
        const char w = (char)swizzle[3];

        Value* vConstMask = C<char>(
            {char(x), char(x + 4), char(x + 8), char(x + 12), char(y), char(y + 4),
             char(y + 8), char(y + 12), char(z), char(z + 4), char(z + 8), char(z + 12),
             char(w), char(w + 4), char(w + 8), char(w + 12), char(x), char(x + 4),
             char(x + 8), char(x + 12), char(y), char(y + 4), char(y + 8), char(y + 12),
             char(z), char(z + 4), char(z + 8), char(z + 12), char(w), char(w + 4),
             char(w + 8), char(w + 12)});
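        // Example (added commentary): with the identity swizzle {0, 1, 2, 3} this
        // constant is {0, 4, 8, 12,  1, 5, 9, 13,  2, 6, 10, 14,  3, 7, 11, 15}
        // per 128-bit half, i.e. PSHUFB collects the x bytes of the four gathered
        // pixels, then the y, z, and w bytes; a BGRA-style swizzle {2, 1, 0, 3}
        // swaps the x and z byte groups in the same pass.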
        // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..

        Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
        Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);

        Value* vShufResult_lo =
            BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
        Value* vShufResult_hi =
            BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);

        // after pshufb: group components together in each 128bit lane
        // 256i - 0    1    2    3    4    5    6    7
        //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

        Value* vi128XY_lo = nullptr;
        Value* vi128XY_hi = nullptr;
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
        {
            vi128XY_lo = BITCAST(
                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
                v128Ty);
            vi128XY_hi = BITCAST(
                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
                v128Ty);

            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
        }

        // do the same for zw components
        Value* vi128ZW_lo = nullptr;
        Value* vi128ZW_hi = nullptr;
        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
        {
            vi128ZW_lo = BITCAST(
                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
                v128Ty);
            vi128ZW_hi = BITCAST(
                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
                v128Ty);
        }

        // init denormalize variables if needed
        Instruction::CastOps fpCast;
        Value*               conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast           = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 127.0));
            break;
        case CONVERT_SSCALED:
            fpCast           = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_USCALED:
            SWR_INVALID("Type should not be sign extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // sign extend all enabled components. If we have a fill vVertexElements, output to current
        // simdvertex
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
                    Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;

                    Value* temp_lo =
                        PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
                    Value* temp_hi =
                        PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));

                    Value* temp = JOIN_16(temp_lo, temp_hi);

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
                    }

                    vVertexElements[currentVertexElement] = temp;

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else if ((extendType == Instruction::CastOps::ZExt) ||
             (extendType == Instruction::CastOps::UIToFP))
    {
        // init denormalize variables if needed
        Instruction::CastOps fpCast;
        Value*               conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 255.0));
            break;
        case CONVERT_USCALED:
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_SSCALED:
            SWR_INVALID("Type should not be zero extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // pshufb masks for each component
                    Value* vConstMask;
                    switch (swizzle[i])
                    {
                    case 0:
                        // x shuffle mask
                        vConstMask =
                            C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                     0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                        break;
                    case 1:
                        // y shuffle mask
                        vConstMask =
                            C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                     1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                        break;
                    case 2:
                        // z shuffle mask
                        vConstMask =
                            C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                     2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                        break;
                    case 3:
                        // w shuffle mask
                        vConstMask =
                            C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                     3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                        break;
                    default:
                        vConstMask = nullptr;
                        break;
                    }

                    Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
                    Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);

                    Value* temp_lo =
                        BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
                    Value* temp_hi =
                        BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);

                    // after pshufb for x channel
                    // 256i - 0    1    2    3    4    5    6    7
                    //        x000 x000 x000 x000 x000 x000 x000 x000

                    Value* temp = JOIN_16(temp_lo, temp_hi);

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
                    }

                    vVertexElements[currentVertexElement] = temp;

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_INVALID("Unsupported conversion type");
    }
}
void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs& args)
{
    // Unpack tuple args
    Value*& vGatherResult = std::get<0>(args);
    Value*  pVtxOut       = std::get<1>(args);
    const Instruction::CastOps extendType     = std::get<2>(args);
    const ConversionType       conversionType = std::get<3>(args);
    uint32_t& currentVertexElement = std::get<4>(args);
    uint32_t& outputElt            = std::get<5>(args);
    const ComponentEnable compMask = std::get<6>(args);
    const ComponentControl (&compCtrl)[4] = std::get<7>(args);
    Value* (&vVertexElements)[4]          = std::get<8>(args);
    const uint32_t (&swizzle)[4]          = std::get<9>(args);

    // cast types
    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

    for (uint32_t i = 0; i < 4; i++)
    {
        if (!isComponentEnabled(compMask, i))
        {
            continue;
        }

        if (compCtrl[i] == ComponentControl::StoreSrc)
        {
            std::vector<uint32_t> vShuffleMasks[4] = {
                {0, 4, 8, 12, 16, 20, 24, 28},  // x
                {1, 5, 9, 13, 17, 21, 25, 29},  // y
                {2, 6, 10, 14, 18, 22, 26, 30}, // z
                {3, 7, 11, 15, 19, 23, 27, 31}, // w
            };

            Value* val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
                                  UndefValue::get(v32x8Ty),
                                  vShuffleMasks[swizzle[i]]);

            if ((extendType == Instruction::CastOps::SExt) ||
                (extendType == Instruction::CastOps::SIToFP))
            {
                switch (conversionType)
                {
                case CONVERT_NORMALIZED:
                    val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
                    break;
                case CONVERT_SSCALED:
                    val = SI_TO_FP(val, mSimdFP32Ty);
                    break;
                case CONVERT_USCALED:
                    SWR_INVALID("Type should not be sign extended!");
                    break;
                default:
                    SWR_ASSERT(conversionType == CONVERT_NONE);
                    val = S_EXT(val, mSimdInt32Ty);
                    break;
                }
            }
            else if ((extendType == Instruction::CastOps::ZExt) ||
                     (extendType == Instruction::CastOps::UIToFP))
            {
                switch (conversionType)
                {
                case CONVERT_NORMALIZED:
                    val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
                    break;
                case CONVERT_SSCALED:
                    SWR_INVALID("Type should not be zero extended!");
                    break;
                case CONVERT_USCALED:
                    val = UI_TO_FP(val, mSimdFP32Ty);
                    break;
                default:
                    SWR_ASSERT(conversionType == CONVERT_NONE);
                    val = Z_EXT(val, mSimdInt32Ty);
                    break;
                }
            }
            else
            {
                SWR_INVALID("Unsupported conversion type");
            }

            vVertexElements[currentVertexElement++] = val;
        }
        else
        {
            vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
        }

        if (currentVertexElement > 3)
        {
            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
            // reset to the next vVertexElement to output
            currentVertexElement = 0;
        }
    }
}
//////////////////////////////////////////////////////////////////////////
/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
/// denormalizes if needed, converts to F32 if needed, and positions in
/// the proper SIMD rows to be output to the simdvertex structure
/// @param args: (tuple of args, listed below)
///   @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
///   @param pVtxOut - base pointer to output simdvertex struct
///   @param extendType - sign extend or zero extend
///   @param bNormalized - do we need to denormalize?
///   @param currentVertexElement - reference to the current vVertexElement
///   @param outputElt - reference to the current offset from simdvertex we're outputting to
///   @param compMask - component packing mask
///   @param compCtrl - component control val
///   @param vVertexElements[4] - vertex components to output
void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs& args)
{
    // Unpack tuple args
    Value* (&vGatherResult)[2] = std::get<0>(args);
    Value*  pVtxOut            = std::get<1>(args);
    const Instruction::CastOps extendType     = std::get<2>(args);
    const ConversionType       conversionType = std::get<3>(args);
    uint32_t& currentVertexElement = std::get<4>(args);
    uint32_t& outputElt            = std::get<5>(args);
    const ComponentEnable compMask = std::get<6>(args);
    const ComponentControl (&compCtrl)[4] = std::get<7>(args);
    Value* (&vVertexElements)[4]          = std::get<8>(args);
    // cast types
    Type* vGatherTy = VectorType::get(mInt32Ty, 8);
    Type* v32x8Ty   = VectorType::get(mInt8Ty, 32);

    // have to do extra work for sign extending
    if ((extendType == Instruction::CastOps::SExt) ||
        (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
    {
        // is this PP float?
        bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;

        Type* v8x16Ty   = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);

        // shuffle mask
        Value* vConstMask = C<uint8_t>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
        Value* vi128XY_lo = nullptr;
        Value* vi128XY_hi = nullptr;
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
        {
            // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for
            // now..

            Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty);
            Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty);

            Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
            Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);

            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            vi128XY_lo = BITCAST(
                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
                v128bitTy);
            vi128XY_hi = BITCAST(
                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
                v128bitTy);

            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
        }

        // do the same for zw components
        Value* vi128ZW_lo = nullptr;
        Value* vi128ZW_hi = nullptr;
        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
        {
            Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty);
            Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty);

            Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
            Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);

            vi128ZW_lo = BITCAST(
                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
                v128bitTy);
            vi128ZW_hi = BITCAST(
                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
                v128bitTy);
        }
        // init denormalize variables if needed
        Instruction::CastOps IntToFpCast;
        Value*               conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            IntToFpCast      = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
            break;
        case CONVERT_SSCALED:
            IntToFpCast      = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_USCALED:
            SWR_INVALID("Type should not be sign extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // sign extend all enabled components. If we have a fill vVertexElements, output to current
        // simdvertex
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
                    Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;

                    if (bFP)
                    {
                        // extract 128 bit lanes to sign extend each component
                        Value* temp_lo =
                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
                        Value* temp_hi =
                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));

                        vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
                    }
                    else
                    {
                        // extract 128 bit lanes to sign extend each component
                        Value* temp_lo =
                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
                        Value* temp_hi =
                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));

                        Value* temp = JOIN_16(temp_lo, temp_hi);

                        // denormalize if needed
                        if (conversionType != CONVERT_NONE)
                        {
                            temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor);
                        }

                        vVertexElements[currentVertexElement] = temp;
                    }

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else if ((extendType == Instruction::CastOps::ZExt) ||
             (extendType == Instruction::CastOps::UIToFP))
    {
        // pshufb masks for each component
        Value* vConstMask[2];

        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
        {
            // x/z shuffle mask
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });
        }

        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
        {
            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
        }

        // init denormalize variables if needed
        Instruction::CastOps fpCast;
        Value*               conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
            break;
        case CONVERT_USCALED:
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0f));
            break;
        case CONVERT_SSCALED:
            SWR_INVALID("Type should not be zero extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // select correct constMask for x/z or y/w pshufb
                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    uint32_t selectedGather = (i < 2) ? 0 : 1;

                    // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of
                    // KNL, for now..

                    Value* vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
                    Value* vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);

                    Value* temp_lo = BITCAST(
                        PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]),
                        vGatherTy);
                    Value* temp_hi = BITCAST(
                        PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]),
                        vGatherTy);

                    // after pshufb mask for x channel; z uses the same shuffle from the second
                    // gather 256i - 0    1    2    3    4    5    6    7
                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00

                    Value* temp = JOIN_16(temp_lo, temp_hi);

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
                    }

                    vVertexElements[currentVertexElement] = temp;

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_INVALID("Unsupported conversion type");
    }
}
1884 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs
& args
)
1886 // Unpack tuple args
1887 Value
*(&vGatherResult
)[2] = std::get
<0>(args
);
1888 Value
* pVtxOut
= std::get
<1>(args
);
1889 const Instruction::CastOps extendType
= std::get
<2>(args
);
1890 const ConversionType conversionType
= std::get
<3>(args
);
1891 uint32_t& currentVertexElement
= std::get
<4>(args
);
1892 uint32_t& outputElt
= std::get
<5>(args
);
1893 const ComponentEnable compMask
= std::get
<6>(args
);
1894 const ComponentControl(&compCtrl
)[4] = std::get
<7>(args
);
1895 Value
*(&vVertexElements
)[4] = std::get
<8>(args
);

    // cast types
    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
    Type* v32x8Ty   = VectorType::get(mInt8Ty, mVWidth * 4); // mVWidth is in units of 32 bits

    // have to do extra work for sign extending
    if ((extendType == Instruction::CastOps::SExt) ||
        (extendType == Instruction::CastOps::SIToFP) ||
        (extendType == Instruction::CastOps::FPExt))
    {
        // is this packed half-precision float?
        bool bFP = (extendType == Instruction::CastOps::FPExt);

        Type* v8x16Ty   = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
                                          mVWidth / 4); // mVWidth is in units of 32 bits

        Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                     0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});

        Value* vi128XY = nullptr;
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
        {
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
        }

        // do the same for zw components
        Value* vi128ZW = nullptr;
        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
        {
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
            vi128ZW =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
        }
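
        // Concretely, for eight vertices v0..v7 each gathered dword holds
        // {x, y} as two packed words. PSHUFB rearranges each 128-bit half to
        // x0 x1 x2 x3 y0 y1 y2 y3, and VPERMD with {0, 1, 4, 5, 2, 3, 6, 7}
        // then packs all eight x words into the low 128-bit lane and all eight
        // y words into the high lane, ready for per-lane extension.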

        // init denormalize variables if needed
        Instruction::CastOps IntToFpCast;
        Value*               conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            IntToFpCast      = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
            break;
        case CONVERT_SSCALED:
            IntToFpCast      = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_USCALED:
            SWR_INVALID("Type should not be sign extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }
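
        // SNORM16 note: multiplying by 1/32767 maps the signed range
        // [-32767, 32767] onto [-1.0, 1.0]; by the usual SNORM convention the
        // remaining value -32768 is treated as -1.0.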

        // sign extend all enabled components. If we have a full vVertexElements,
        // output to the current simdvertex
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                    if (bFP)
                    {
                        // extract 128 bit lanes and convert each half to full float
                        vVertexElements[currentVertexElement] =
                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
                    }
                    else
                    {
                        // extract 128 bit lanes to sign extend each component
                        vVertexElements[currentVertexElement] =
                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));

                        // denormalize if needed
                        if (conversionType != CONVERT_NONE)
                        {
                            vVertexElements[currentVertexElement] =
                                FMUL(CAST(IntToFpCast,
                                          vVertexElements[currentVertexElement],
                                          mSimdFP32Ty),
                                     conversionFactor);
                        }
                    }
                    currentVertexElement++;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
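
    // CVTPH2PS converts eight packed half-precision floats to single precision
    // (F16C), while PMOVSXWD sign-extends eight 16-bit integers to 32 bits
    // (SSE4.1/AVX); only the integer path needs the FMUL denormalize.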
    // else zero extend
    else if ((extendType == Instruction::CastOps::ZExt) ||
             (extendType == Instruction::CastOps::UIToFP))
    {
        // pshufb masks for each component
        Value* vConstMask[2];

        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
        {
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });
        }

        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
        {
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
        }

        // init denormalize variables if needed
        Instruction::CastOps fpCast;
        Value*               conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
            break;
        case CONVERT_USCALED:
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0f));
            break;
        case CONVERT_SSCALED:
            SWR_INVALID("Type should not be zero extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // select correct constMask for x/z or y/w pshufb
                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use the first gather result, else the second
                    uint32_t selectedGather = (i < 2) ? 0 : 1;

                    vVertexElements[currentVertexElement] =
                        BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty),
                                       vConstMask[selectedMask]),
                                vGatherTy);
                    // after pshufb mask for x channel; z uses the same shuffle from the second
                    // gather
                    // 256i - 0    1    2    3    4    5    6    7
                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        vVertexElements[currentVertexElement] =
                            FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty),
                                 conversionFactor);
                    }
                    currentVertexElement++;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_INVALID("Unsupported conversion type");
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Output a simdvertex worth of elements to the current outputElt
/// @param pVtxOut - base address of VIN output struct
/// @param outputElt - simdvertex offset in VIN to write to
/// @param numEltsToStore - number of simdvertex rows to write out
/// @param vVertexElements - LLVM Value*[] simdvertex to write out
void FetchJit::StoreVertexElements(Value*         pVtxOut,
                                   const uint32_t outputElt,
                                   const uint32_t numEltsToStore,
                                   Value* (&vVertexElements)[4])
{
    SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");

    for (uint32_t c = 0; c < numEltsToStore; ++c)
    {
        // STORE expects FP32 x vWidth type, just bitcast if needed
        if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
        {
#if FETCH_DUMP_VERTEX
            PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
#endif
            vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
        }
#if FETCH_DUMP_VERTEX
        else
        {
            PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
        }
#endif
        // outputElt * 4 = offsetting by the size of a simdvertex
        // + c offsets to a 32bit x vWidth row within the current vertex
        Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), nullptr, "destGEP");
        STORE(vVertexElements[c], dest);
    }
}
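
// Addressing example: with outputElt = 2 and c = 1 the GEP above lands on row
// 2 * 4 + 1 = 9, i.e. the second 32-bit x vWidth row of the third simdvertex.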

//////////////////////////////////////////////////////////////////////////
/// @brief Generates a constant vector of values based on the
/// ComponentControl value
/// @param ctrl - ComponentControl value
Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
{
    switch (ctrl)
    {
    case NoStore:
        return VUNDEF_I();
    case Store0:
        return VIMMED1(0);
    case Store1Fp:
        return VIMMED1(1.0f);
    case Store1Int:
        return VIMMED1(1);
    case StoreVertexId:
    {
        if (mVWidth == 16)
        {
            Type*  pSimd8FPTy = VectorType::get(mFP32Ty, 8);
            Value* pIdLo =
                BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), pSimd8FPTy);
            Value* pIdHi =
                BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2})), pSimd8FPTy);
            return JOIN_16(pIdLo, pIdHi);
        }
        else
        {
            return BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), mSimdFP32Ty);
        }
    }
    case StoreInstanceId:
    {
        Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance})), mFP32Ty);
        return VBROADCAST(pId);
    }
    default:
        SWR_INVALID("Invalid component control");
        return VUNDEF_I();
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Returns the enable mask for the specified component.
/// @param enableMask - enable bits
/// @param component - component to check if enabled.
bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
{
    switch (component)
    {
    // X
    case 0:
        return (enableMask & ComponentEnable::X);
    // Y
    case 1:
        return (enableMask & ComponentEnable::Y);
    // Z
    case 2:
        return (enableMask & ComponentEnable::Z);
    // W
    case 3:
        return (enableMask & ComponentEnable::W);

    default:
        return false;
    }
}
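
// Since ComponentEnable is a bitmask with X/Y/Z/W in ascending bit order, the
// switch above is equivalent to testing (enableMask >> component) & 0x1; the
// explicit cases just keep the mapping obvious.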

// Don't want two threads compiling the same fetch shader simultaneously:
// the JIT cache implementation has problems with concurrent compiles.
// This is only a problem for fetch right now.
static std::mutex gFetchCodegenMutex;

//////////////////////////////////////////////////////////////////////////
/// @brief JITs from fetch shader IR
/// @param hJitMgr - JitManager handle
/// @param hFunc - LLVM function IR
/// @return PFN_FETCH_FUNC - pointer to fetch code
PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
{
    const llvm::Function* func    = (const llvm::Function*)hFunc;
    JitManager*           pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
    PFN_FETCH_FUNC        pfnFetch;

    gFetchCodegenMutex.lock();
    pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
    // MCJIT finalizes modules the first time you JIT code from them. After finalization you
    // cannot add new IR to the module.
    pJitMgr->mIsModuleFinalized = true;

#if defined(KNOB_SWRC_TRACING)
    char        fName[1024]; // dump filename buffer (size assumed)
    const char* funcName = func->getName().data();
    sprintf(fName, "%s.bin", funcName);
    FILE* fd = fopen(fName, "wb");
    fwrite((void*)pfnFetch, 1, 2048, fd);
    fclose(fd);
#endif

    pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
    gFetchCodegenMutex.unlock();

    return pfnFetch;
}

//////////////////////////////////////////////////////////////////////////
/// @brief JIT compiles fetch shader
/// @param hJitMgr - JitManager handle
/// @param state - fetch state to build function from
extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
{
    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);

    pJitMgr->SetupNewModule();

    FetchJit theJit(pJitMgr);
    HANDLE   hFunc = theJit.Create(state);

    return JitFetchFunc(hJitMgr, hFunc);
}
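
// Typical driver usage (sketch; the steps below are illustrative, not a
// verified call site):
//   FETCH_COMPILE_STATE state = {};
//   // describe the vertex layout, index type, conversions, etc. on state
//   PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, state);
//   // pfnFetch can then be invoked per draw with a populated SWR_FETCH_CONTEXT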