1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder_gfx_mem.h"
32 #include "jit_api.h"
33 #include "fetch_jit.h"
34 #include "gen_state_llvm.h"
35 #include "functionpasses/passes.h"
36
37 //#define FETCH_DUMP_VERTEX 1
38 using namespace llvm;
39 using namespace SwrJit;
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45 CONVERT_NONE,
46 CONVERT_NORMALIZED,
47 CONVERT_USCALED,
48 CONVERT_SSCALED,
49 CONVERT_SFIXED,
50 };
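// How gathered data is converted before being handed to the shader (as used below):
//   CONVERT_NONE       - leave the raw bits untouched
//   CONVERT_NORMALIZED - scale UNORM/SNORM data into [0,1] / [-1,1] (see ConvertFormat and the Shuffle* helpers)
//   CONVERT_USCALED    - unsigned integer converted directly to float
//   CONVERT_SSCALED    - signed integer converted directly to float
//   CONVERT_SFIXED     - 16.16 fixed point, scaled by 1/65536 into float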
51
52 //////////////////////////////////////////////////////////////////////////
53 /// Interface to Jitting a fetch shader
54 //////////////////////////////////////////////////////////////////////////
55 struct FetchJit : public BuilderGfxMem
56 {
57 FetchJit(JitManager* pJitMgr) :
58 BuilderGfxMem(pJitMgr)
59 {}
60
61 Function* Create(const FETCH_COMPILE_STATE& fetchState);
62
63 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
64 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
65 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
66 template<typename T> Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex);
67
68 // package up Shuffle*bpcGatherd args into a tuple for convenience
69 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
70 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
71 const uint32_t(&)[4]> Shuffle8bpcArgs;
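// Tuple layout (matches the std::get<> unpacking in Shuffle8bpcGatherd*):
//   0: gathered result, 1: pVtxOut, 2: extend cast op, 3: conversion type,
//   4: currentVertexElement (ref), 5: outputElt (ref), 6: component enable mask,
//   7: component controls, 8: vVertexElements out, 9: component swizzle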
72
73 void Shuffle8bpcGatherd16(Shuffle8bpcArgs &args);
74 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
75
76 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
77 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
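// Same layout as Shuffle8bpcArgs, minus the trailing swizzle; element 0 is the
// pair of gathered xy/zw results (see the unpacking in Shuffle16bpcGather*).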
78
79 void Shuffle16bpcGather16(Shuffle16bpcArgs &args);
80 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
81
82 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
83
84 Value *GenerateCompCtrlVector(const ComponentControl ctrl);
85
86 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
87
88 bool IsOddFormat(SWR_FORMAT format);
89 bool IsUniformFormat(SWR_FORMAT format);
90 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
91 void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
92 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
93
94 Value* mpFetchInfo;
95 };
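//////////////////////////////////////////////////////////////////////////
/// High-level flow of the jitted fetch shader built by FetchJit::Create:
///   1. load/validate a SIMD of indices (8/16/32 bit, optional OOB check)
///   2. compute and store out VertexID (and the cut mask, if enabled)
///   3. JitGatherVertices: per input element, gather from the vertex buffers,
///      convert/shuffle into place, and store to the output simdvertex
//////////////////////////////////////////////////////////////////////////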
96
97 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
98 {
99 std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
100 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
101
102 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
103 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
104
105 fetch->getParent()->setModuleIdentifier(fetch->getName());
106
107 IRB()->SetInsertPoint(entry);
108
109 auto argitr = fetch->arg_begin();
110
111 // Fetch shader arguments
112 Value* privateContext = &*argitr; ++argitr;
113 privateContext->setName("privateContext");
114 SetPrivateContext(privateContext);
115
116 mpFetchInfo = &*argitr; ++argitr;
117 mpFetchInfo->setName("fetchInfo");
118 Value* pVtxOut = &*argitr;
119 pVtxOut->setName("vtxOutput");
120
121 uint32_t baseWidth = mVWidth;
122
123 SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth);
124
125 // Override builder target width to force 16-wide SIMD
126 #if USE_SIMD16_SHADERS
127 SetTargetWidth(16);
128 #endif
129
130 pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0));
131
132 // SWR_FETCH_CONTEXT::pStreams
133 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
134 streams->setName("pStreams");
135
136 // SWR_FETCH_CONTEXT::pIndices
137 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_xpIndices});
138 indices->setName("pIndices");
139
140 // SWR_FETCH_CONTEXT::pLastIndex
141 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_xpLastIndex});
142 pLastIndex->setName("pLastIndex");
143
144 Value* vIndices;
145 switch(fetchState.indexType)
146 {
147 case R8_UINT:
148 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
149 if(fetchState.bDisableIndexOOBCheck)
150 {
151 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
152 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
153 }
154 else
155 {
156 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
157 }
158 break;
159 case R16_UINT:
160 if(fetchState.bDisableIndexOOBCheck)
161 {
162 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
163 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
164 }
165 else
166 {
167 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
168 }
169 break;
170 case R32_UINT:
171 vIndices = fetchState.bDisableIndexOOBCheck ? LOAD(indices, "", PointerType::get(mSimdInt32Ty, 0), GFX_MEM_CLIENT_FETCH)
172 : GetSimdValid32bitIndices(indices, pLastIndex);
173 break; // incoming type is already 32bit int
174 default:
175 SWR_INVALID("Unsupported index type");
176 vIndices = nullptr;
177 break;
178 }
179
180 if(fetchState.bForceSequentialAccessEnable)
181 {
182 Value* pOffsets = mVWidth == 8 ? C({ 0, 1, 2, 3, 4, 5, 6, 7 }) :
183 C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
184
185 // VertexData buffers are accessed sequentially; the index is equal to the vertex number
186 vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
187 vIndices = ADD(vIndices, pOffsets);
188 }
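// e.g. with mVWidth == 8 and StartVertex == 100, vIndices ends up as
// { 100, 101, 102, 103, 104, 105, 106, 107 }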
189
190 Value* vVertexId = vIndices;
191 if (fetchState.bVertexIDOffsetEnable)
192 {
193 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
194 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
195 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
196 vVertexId = ADD(vIndices, vBaseVertex);
197 vVertexId = ADD(vVertexId, vStartVertex);
198 }
199
200 // store out vertex IDs
201 if (mVWidth == 16)
202 {
203 // store out in simd8 halves until core supports 16-wide natively
204 auto vVertexIdLo = EXTRACT_16(vVertexId, 0);
205 auto vVertexIdHi = EXTRACT_16(vVertexId, 1);
206 STORE(vVertexIdLo, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
207 STORE(vVertexIdHi, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
208 }
209 else if (mVWidth == 8)
210 {
211 STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
212 }
213
214 // store out cut mask if enabled
215 if (fetchState.bEnableCutIndex)
216 {
217 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
218 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
219
220 if (mVWidth == 16)
221 {
222 auto cutMaskLo = EXTRACT_16(cutMask, 0);
223 auto cutMaskHi = EXTRACT_16(cutMask, 1);
224 STORE(cutMaskLo, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
225 STORE(cutMaskHi, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
226 }
227 else if (mVWidth == 8)
228 {
229 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
230 }
231 }
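// The cut mask marks lanes whose index matched the cut index; it is consumed
// downstream by primitive assembly to restart strip primitives.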
232
233 // Fetch attributes from memory and output to a simdvertex struct
234 JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
235
236 RET_VOID();
237
238 JitManager::DumpToFile(fetch, "src");
239
240 #if defined(_DEBUG)
241 verifyFunction(*fetch);
242 #endif
243
244 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
245
246 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
247 setupPasses.add(createBreakCriticalEdgesPass());
248 setupPasses.add(createCFGSimplificationPass());
249 setupPasses.add(createEarlyCSEPass());
250 setupPasses.add(createPromoteMemoryToRegisterPass());
251
252 setupPasses.run(*fetch);
253
254 JitManager::DumpToFile(fetch, "se");
255
256 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
257
258 ///@todo Haven't touched these either. Need to remove some of these and add others.
259 optPasses.add(createCFGSimplificationPass());
260 optPasses.add(createEarlyCSEPass());
261 optPasses.add(createInstructionCombiningPass());
262 optPasses.add(createInstructionSimplifierPass());
263 optPasses.add(createConstantPropagationPass());
264 optPasses.add(createSCCPPass());
265 optPasses.add(createAggressiveDCEPass());
266
267 optPasses.run(*fetch);
268
269 optPasses.add(createLowerX86Pass(JM(), this));
270 optPasses.run(*fetch);
271
272 JitManager::DumpToFile(fetch, "opt");
273
274
275 // Revert 16-wide override
276 #if USE_SIMD16_SHADERS
277 SetTargetWidth(baseWidth);
278 #endif
279
280 return fetch;
281 }
282
283 // returns true for odd formats that require special gather handling
284 bool FetchJit::IsOddFormat(SWR_FORMAT format)
285 {
286 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
287 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
288 {
289 return true;
290 }
291 return false;
292 }
293
294 // format is uniform if all components are the same size and type
295 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
296 {
297 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
298 uint32_t bpc0 = info.bpc[0];
299 uint32_t type0 = info.type[0];
300
301 for (uint32_t c = 1; c < info.numComps; ++c)
302 {
303 if (bpc0 != info.bpc[c] || type0 != info.type[c])
304 {
305 return false;
306 }
307 }
308 return true;
309 }
310
311 // unpacks components based on format
312 // foreach component in the pixel
313 // mask off everything but this component
314 // shift component to LSB
315 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
316 {
317 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
318
319 uint32_t bitOffset = 0;
320 for (uint32_t c = 0; c < info.numComps; ++c)
321 {
322 uint32_t swizzledIndex = info.swizzle[c];
323 uint32_t compBits = info.bpc[c];
324 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
325 Value* comp = AND(vInput, bitmask);
326 comp = LSHR(comp, bitOffset);
327
328 result[swizzledIndex] = comp;
329 bitOffset += compBits;
330 }
331 }
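// Worked example for a 10-10-10-2 layout: component 0 is AND'ed with 0x3FF and
// shifted right by 0, component 1 with 0xFFC00 then >> 10, component 2 with
// 0x3FF00000 then >> 20, and component 3 with 0xC0000000 then >> 30, each
// landing in result[info.swizzle[c]].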
332
333 // gather for odd component size formats
334 // gather a SIMD of full pixels per lane, then shift/mask to move each component into its
335 // own vector
336 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
337 {
338 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
339
340 // only works if pixel size is <= 32bits
341 SWR_ASSERT(info.bpp <= 32);
342
343 Value *pGather;
344 if (info.bpp == 32)
345 {
346 pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
347 }
348 else
349 {
350 // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
351 Value *pMem = ALLOCA(mSimdInt32Ty);
352 STORE(VIMMED1(0u), pMem);
353
354 pBase = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
355 Value* pDstMem = BITCAST(pMem, mInt32PtrTy);
356
357 for (uint32_t lane = 0; lane < mVWidth; ++lane)
358 {
359 // Get index
360 Value* index = VEXTRACT(pOffsets, C(lane));
361 Value* mask = VEXTRACT(pMask, C(lane));
362 switch (info.bpp)
363 {
364 case 8:
365 {
366 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
367 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt8Ty, 0));
368 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
369 break;
370 }
371
372 case 16:
373 {
374 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
375 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
376 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
377 break;
378 }
379 break;
380
381 case 24:
382 {
383 // First 16-bits of data
384 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
385 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
386 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
387
388 // Last 8-bits of data
389 pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
390 pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0));
391 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
392 break;
393 }
394
395 default:
396 SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
397 break;
398 }
399 }
400
401 pGather = LOAD(pMem);
402 }
403
404 for (uint32_t comp = 0; comp < 4; ++comp)
405 {
406 pResult[comp] = VIMMED1((int)info.defaults[comp]);
407 }
408
409 UnpackComponents(format, pGather, pResult);
410
411 // cast to fp32
412 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
413 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
414 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
415 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
416 }
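// e.g. a 32bpp odd format such as R10G10B10A2 takes the single GATHERDD path above,
// while sub-32bpp odd formats such as B5G6R5 fall back to the per-lane scalar loads
// so the gather never reads past the end of the buffer.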
417
418 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
419 {
420 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
421
422 for (uint32_t c = 0; c < info.numComps; ++c)
423 {
424 uint32_t compIndex = info.swizzle[c];
425
426 // skip any conversion on UNUSED components
427 if (info.type[c] == SWR_TYPE_UNUSED)
428 {
429 continue;
430 }
431
432 if (info.isNormalized[c])
433 {
434 if (info.type[c] == SWR_TYPE_SNORM)
435 {
436 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
437
438 /// result = c * (1.0f / (2^(n-1) - 1))
439 uint32_t n = info.bpc[c];
440 uint32_t pow2 = 1 << (n - 1);
441 float scale = 1.0f / (float)(pow2 - 1);
442 Value *vScale = VIMMED1(scale);
443 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
444 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
445 texels[compIndex] = FMUL(texels[compIndex], vScale);
446 }
447 else
448 {
449 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
450
451 /// result = c * (1.0f / (2^n - 1))
452 uint32_t n = info.bpc[c];
453 uint32_t pow2 = 1 << n;
454 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
455 if (n == 24)
456 {
457 float scale = (float)(pow2 - 1);
458 Value* vScale = VIMMED1(scale);
459 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
460 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
461 texels[compIndex] = FDIV(texels[compIndex], vScale);
462 }
463 else
464 {
465 float scale = 1.0f / (float)(pow2 - 1);
466 Value *vScale = VIMMED1(scale);
467 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
468 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
469 texels[compIndex] = FMUL(texels[compIndex], vScale);
470 }
471 }
472 continue;
473 }
474 }
475 }
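// e.g. for an 8-bit UNORM component the scale is 1/255, so raw 255 -> 1.0f;
// for 8-bit SNORM the scale is 1/127, so +127 -> 1.0f and -127 -> -1.0f
// (the @todo above notes the most-negative encoding should also map to -1.0f).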
476
477 //////////////////////////////////////////////////////////////////////////
478 /// @brief Loads attributes from memory using AVX2 GATHER(s)
479 /// @param fetchState - info about attributes to be fetched from memory
480 /// @param streams - value pointer to the current vertex stream
481 /// @param vIndices - vector value of indices to gather
482 /// @param pVtxOut - value pointer to output simdvertex struct
483 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
484 Value* streams, Value* vIndices, Value* pVtxOut)
485 {
486 uint32_t currentVertexElement = 0;
487 uint32_t outputElt = 0;
488 Value* vVertexElements[4];
489
490 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
491 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
492 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
493 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
494 curInstance->setName("curInstance");
495
496 for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
497 {
498 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
499
500 // skip element if all components are disabled
501 if (ied.ComponentPacking == ComponentEnable::NONE)
502 {
503 continue;
504 }
505
506 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
507 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
508 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
509
510 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
511
512 // VGATHER* takes an *i8 src pointer
513 Value *pStreamBase = INT_TO_PTR(stream, PointerType::get(mInt8Ty, 0));
514
515 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
516 Value *vStride = VBROADCAST(stride);
517
518 // max vertex index that is fully in bounds
519 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
520 maxVertex = LOAD(maxVertex);
521
522 Value *minVertex = NULL;
523 if (fetchState.bPartialVertexBuffer)
524 {
525 // min vertex index for low bounds OOB checking
526 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
527 minVertex = LOAD(minVertex);
528 }
529
530 if (fetchState.bInstanceIDOffsetEnable)
531 {
532 // the InstanceID (curInstance) value is offset by StartInstanceLocation
533 curInstance = ADD(curInstance, startInstance);
534 }
535
536 Value *vCurIndices;
537 Value *startOffset;
538 Value *vInstanceStride = VIMMED1(0);
539
540 if (ied.InstanceEnable)
541 {
542 Value* stepRate = C(ied.InstanceAdvancementState);
543
544 // prevent a div by 0 for 0 step rate
545 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
546 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
547
548 // calc the current offset into instanced data buffer
549 Value* calcInstance = UDIV(curInstance, stepRate);
550
551 // if step rate is 0, every instance gets instance 0
552 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
553
554 vCurIndices = VBROADCAST(calcInstance);
555 startOffset = startInstance;
556 }
557 else if (ied.InstanceStrideEnable)
558 {
559 // grab the instance advancement state, which determines the stride in bytes from one instance to the next
560 Value* stepRate = C(ied.InstanceAdvancementState);
561 vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
562
563 // offset indices by baseVertex
564 vCurIndices = ADD(vIndices, vBaseVertex);
565
566 startOffset = startVertex;
567 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
568 }
569 else
570 {
571 // offset indices by baseVertex
572 vCurIndices = ADD(vIndices, vBaseVertex);
573 startOffset = startVertex;
574 }
575
576 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
577 // do 64bit address offset calculations.
578
579 // calculate byte offset to the start of the VB
580 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
581 pStreamBase = GEP(pStreamBase, baseOffset);
582 Value* pStreamBaseGFX = ADD(stream, baseOffset);
583
584 // if we have a start offset, subtract from max vertex. Used for OOB check
585 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
586 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
587 // if we have a negative value, we're already OOB. clamp at 0.
588 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
589
590 if (fetchState.bPartialVertexBuffer)
591 {
592 // similarly for min vertex
593 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
594 Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
595 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
596 }
597
598 // Load the in bounds size of a partially valid vertex
599 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
600 partialInboundsSize = LOAD(partialInboundsSize);
601 Value *vPartialVertexSize = VBROADCAST(partialInboundsSize);
602 Value *vBpp = VBROADCAST(C(info.bpp));
603 Value *vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
604
605 // is the element <= the partially valid size?
606 Value *vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
607
608 // override cur indices with 0 if pitch is 0
609 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
610 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
611
612 // are vertices partially OOB?
613 Value* vMaxVertex = VBROADCAST(maxVertex);
614 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
615
616 // are vertices fully in bounds?
617 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
618
619 Value *vGatherMask;
620 if (fetchState.bPartialVertexBuffer)
621 {
622 // are vertices below minVertex limit?
623 Value *vMinVertex = VBROADCAST(minVertex);
624 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
625
626 // only fetch lanes that pass both tests
627 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
628 }
629 else
630 {
631 vGatherMask = vMaxGatherMask;
632 }
633
634 // blend in any partially OOB indices that have valid elements
635 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
636
637 // calculate the actual offsets into the VB
638 Value* vOffsets = MUL(vCurIndices, vStride);
639 vOffsets = ADD(vOffsets, vAlignmentOffsets);
640
641 // if instance stride enable is:
642 // true - add product of the instanceID and advancement state to the offset into the VB
643 // false - value of vInstanceStride has been initialized to zero
644 vOffsets = ADD(vOffsets, vInstanceStride);
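// e.g. a non-instanced element with index 5, pitch 32 and AlignedByteOffset 12
// gathers from byte offset 5 * 32 + 12 = 172 within the stream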
645
646 // Packing and component control
647 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
648 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
649 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
650
651 // Special gather/conversion for formats without equal component sizes
652 if (IsOddFormat((SWR_FORMAT)ied.Format))
653 {
654 Value *pResults[4];
655 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
656 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
657
658 for (uint32_t c = 0; c < 4; c += 1)
659 {
660 if (isComponentEnabled(compMask, c))
661 {
662 vVertexElements[currentVertexElement++] = pResults[c];
663 if (currentVertexElement > 3)
664 {
665 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
666 // reset to the next vVertexElement to output
667 currentVertexElement = 0;
668 }
669 }
670 }
671 }
672 else if(info.type[0] == SWR_TYPE_FLOAT)
673 {
674 ///@todo: support 64 bit vb accesses
675 Value *gatherSrc = VIMMED1(0.0f);
676
677 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
678 "Unsupported format for standard gather fetch.");
679
680 // Gather components from memory to store in a simdvertex structure
681 switch (bpc)
682 {
683 case 16:
684 {
685 Value *vGatherResult[2];
686
687 // if we have at least one component out of x or y to fetch
688 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
689 {
690 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
691 // e.g. result of first 8x32bit integer gather for 16bit components
692 // 256i - 0 1 2 3 4 5 6 7
693 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
694 //
695 }
696
697 // if we have at least one component out of z or w to fetch
698 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
699 {
700 // offset base to the next components(zw) in the vertex to gather
701 pStreamBase = GEP(pStreamBase, C((char)4));
702
703 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
704 // e.g. result of second 8x32bit integer gather for 16bit components
705 // 256i - 0 1 2 3 4 5 6 7
706 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
707 //
708 }
709
710 // if we have at least one component to shuffle into place
711 if (compMask)
712 {
713 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
714 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
715
716 // Shuffle gathered components into place in simdvertex struct
717 mVWidth == 16 ? Shuffle16bpcGather16(args) : Shuffle16bpcGather(args); // outputs to vVertexElements ref
718 }
719 }
720 break;
721 case 32:
722 {
723 for (uint32_t i = 0; i < 4; i += 1)
724 {
725 if (isComponentEnabled(compMask, i))
726 {
727 // if we need to gather the component
728 if (compCtrl[i] == StoreSrc)
729 {
730 // Gather a SIMD of vertices
731 // APIs allow a 4GB range for offsets
732 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
733 // But, we know that elements must be aligned for FETCH. :)
734 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
735 Value *vShiftedOffsets = LSHR(vOffsets, 1);
736 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBaseGFX, vShiftedOffsets, vGatherMask, 2, GFX_MEM_CLIENT_FETCH);
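// e.g. a byte offset of 0x80000004 would sign-extend negative as a 32-bit gather
// index; LSHR by 1 gives 0x40000002, and the scale of 2 passed to GATHERPS
// rebuilds the original offset without the sign-extension problem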
737 }
738 else
739 {
740 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
741 }
742
743 if (currentVertexElement > 3)
744 {
745 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
746 // reset to the next vVertexElement to output
747 currentVertexElement = 0;
748 }
749 }
750
751 // offset base to the next component in the vertex to gather
752 pStreamBase = GEP(pStreamBase, C((char)4));
753 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
754 }
755 }
756 break;
757 case 64:
758 {
759 for (uint32_t i = 0; i < 4; i += 1)
760 {
761 if (isComponentEnabled(compMask, i))
762 {
763 // if we need to gather the component
764 if (compCtrl[i] == StoreSrc)
765 {
766 Value* vShufLo;
767 Value* vShufHi;
768 Value* vShufAll;
769
770 if (mVWidth == 8)
771 {
772 vShufLo = C({ 0, 1, 2, 3 });
773 vShufHi = C({ 4, 5, 6, 7 });
774 vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
775 }
776 else
777 {
778 SWR_ASSERT(mVWidth == 16);
779 vShufLo = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
780 vShufHi = C({ 8, 9, 10, 11, 12, 13, 14, 15 });
781 vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
782 }
783
784 Value *vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
785 Value *vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);
786
787 Value *vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
788 Value *vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);
789
790 Value *vZeroDouble = VECTOR_SPLAT(mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
791
792 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
793 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
794
795 pGatherLo = VCVTPD2PS(pGatherLo);
796 pGatherHi = VCVTPD2PS(pGatherHi);
797
798 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);
799
800 vVertexElements[currentVertexElement++] = pGather;
801 }
802 else
803 {
804 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
805 }
806
807 if (currentVertexElement > 3)
808 {
809 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
810 // reset to the next vVertexElement to output
811 currentVertexElement = 0;
812 }
813 }
814
815 // offset base to the next component in the vertex to gather
816 pStreamBase = GEP(pStreamBase, C((char)8));
817 }
818 }
819 break;
820 default:
821 SWR_INVALID("Tried to fetch invalid FP format");
822 break;
823 }
824 }
825 else
826 {
827 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
828 ConversionType conversionType = CONVERT_NONE;
829
830 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
831 "Unsupported format for standard gather fetch.");
832
833 switch(info.type[0])
834 {
835 case SWR_TYPE_UNORM:
836 conversionType = CONVERT_NORMALIZED;
837 case SWR_TYPE_UINT:
838 extendCastType = Instruction::CastOps::ZExt;
839 break;
840 case SWR_TYPE_SNORM:
841 conversionType = CONVERT_NORMALIZED;
842 case SWR_TYPE_SINT:
843 extendCastType = Instruction::CastOps::SExt;
844 break;
845 case SWR_TYPE_USCALED:
846 conversionType = CONVERT_USCALED;
847 extendCastType = Instruction::CastOps::UIToFP;
848 break;
849 case SWR_TYPE_SSCALED:
850 conversionType = CONVERT_SSCALED;
851 extendCastType = Instruction::CastOps::SIToFP;
852 break;
853 case SWR_TYPE_SFIXED:
854 conversionType = CONVERT_SFIXED;
855 extendCastType = Instruction::CastOps::SExt;
856 break;
857 default:
858 break;
859 }
860
861 // value substituted when component of gather is masked
862 Value* gatherSrc = VIMMED1(0);
863
864 // Gather components from memory to store in a simdvertex structure
865 switch (bpc)
866 {
867 case 8:
868 {
869 // if we have at least one component to fetch
870 if (compMask)
871 {
872 Value *vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
873 // e.g. result of an 8x32bit integer gather for 8bit components
874 // 256i - 0 1 2 3 4 5 6 7
875 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
876
877 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
878 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
879
880 // Shuffle gathered components into place in simdvertex struct
881 mVWidth == 16 ? Shuffle8bpcGatherd16(args) : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
882 }
883 }
884 break;
885 case 16:
886 {
887 Value *vGatherResult[2];
888
889 // if we have at least one component out of x or y to fetch
890 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
891 {
892 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
893 // e.g. result of first 8x32bit integer gather for 16bit components
894 // 256i - 0 1 2 3 4 5 6 7
895 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
896 //
897 }
898
899 // if we have at least one component out of z or w to fetch
900 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
901 {
902 // offset base to the next components(zw) in the vertex to gather
903 pStreamBase = GEP(pStreamBase, C((char)4));
904
905 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
906 // e.g. result of second 8x32bit integer gather for 16bit components
907 // 256i - 0 1 2 3 4 5 6 7
908 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
909 //
910 }
911
912 // if we have at least one component to shuffle into place
913 if (compMask)
914 {
915 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
916 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
917
918 // Shuffle gathered components into place in simdvertex struct
919 mVWidth == 16 ? Shuffle16bpcGather16(args) : Shuffle16bpcGather(args); // outputs to vVertexElements ref
920 }
921 }
922 break;
923 case 32:
924 {
925 // Gather components from memory and shuffle into place in the simdvertex struct
926 for (uint32_t i = 0; i < 4; i++)
927 {
928 if (isComponentEnabled(compMask, i))
929 {
930 // if we need to gather the component
931 if (compCtrl[i] == StoreSrc)
932 {
933 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
934
935 if (conversionType == CONVERT_USCALED)
936 {
937 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
938 }
939 else if (conversionType == CONVERT_SSCALED)
940 {
941 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
942 }
943 else if (conversionType == CONVERT_SFIXED)
944 {
945 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
946 }
947
948 vVertexElements[currentVertexElement++] = pGather;
949
950 // e.g. result of a single 8x32bit integer gather for 32bit components
951 // 256i - 0 1 2 3 4 5 6 7
952 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
953 }
954 else
955 {
956 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
957 }
958
959 if (currentVertexElement > 3)
960 {
961 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
962
963 // reset to the next vVertexElement to output
964 currentVertexElement = 0;
965 }
966
967 }
968
969 // offset base to the next component in the vertex to gather
970 pStreamBase = GEP(pStreamBase, C((char)4));
971 }
972 }
973 break;
974 }
975 }
976 }
977
978 // if we have a partially filled vVertexElement struct, output it
979 if (currentVertexElement > 0)
980 {
981 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
982 }
983 }
984
985 typedef void*(*PFN_TRANSLATEGFXADDRESS_FUNC)(void* pdc, gfxptr_t va);
986 extern "C" void GetSimdValid8bitIndicesGfx(gfxptr_t indices, gfxptr_t lastIndex, uint32_t vWidth, PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, void* pdc, uint32_t* outIndices);
987 extern "C" void GetSimdValid16bitIndicesGfx(gfxptr_t indices, gfxptr_t lastIndex, uint32_t vWidth, PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, void* pdc, uint32_t* outIndices);
988
989 template<typename T> Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex)
990 {
991 SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty, "Function expects gfxptr_t for both input parameters.");
992
993 Type* Ty = nullptr;
994
995 static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t), "Unsupported type for use with GetSimdValidIndicesHelper<T>");
996 constexpr bool bSize = (sizeof(T) == sizeof(uint16_t));
997 if (bSize)
998 {
999 Ty = mInt16PtrTy;
1000 }
1001 else if (sizeof(T) == sizeof(uint8_t))
1002 {
1003 Ty = mInt8PtrTy;
1004 }
1005 else
1006 {
1007 SWR_ASSERT(false, "This should never happen as per static_assert above.");
1008 }
1009
1010 Value* vIndices = VUNDEF_I();
1011
1012 {
1013 // store 0 index on stack to be used to conditionally load from if index address is OOB
1014 Value* pZeroIndex = ALLOCA(Ty);
1015 STORE(C((T)0), pZeroIndex);
1016
1017 // Load a SIMD of index pointers
1018 for (int64_t lane = 0; lane < mVWidth; lane++)
1019 {
1020 // Calculate the address of the requested index
1021 Value *pIndex = GEP(pIndices, C(lane), Ty);
1022
1023 pLastIndex = INT_TO_PTR(pLastIndex, Ty);
1024
1025 // check if the address is less than the max index,
1026 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1027
1028 // if valid, load the index. if not, load 0 from the stack
1029 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1030 Value *index = LOAD(pValid, "valid index", Ty, GFX_MEM_CLIENT_FETCH);
1031
1032 // zero extended index to 32 bits and insert into the correct simd lane
1033 index = Z_EXT(index, mInt32Ty);
1034 vIndices = VINSERT(vIndices, index, lane);
1035 }
1036 }
1037
1038 return vIndices;
1039 }
1040
1041 //////////////////////////////////////////////////////////////////////////
1042 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1043 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1044 /// support
1045 /// @param pIndices - pointer to 8 bit indices
1046 /// @param pLastIndex - pointer to last valid index
1047 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1048 {
1049 return GetSimdValidIndicesHelper<uint8_t>(pIndices, pLastIndex);
1050 }
1051
1052 //////////////////////////////////////////////////////////////////////////
1053 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1054 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1055 /// support
1056 /// @param pIndices - pointer to 16 bit indices
1057 /// @param pLastIndex - pointer to last valid index
1058 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1059 {
1060 return GetSimdValidIndicesHelper<uint16_t>(pIndices, pLastIndex);
1061 }
1062
1063 //////////////////////////////////////////////////////////////////////////
1064 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1065 /// @param pIndices - pointer to 32 bit indices
1066 /// @param pLastIndex - pointer to last valid index
1067 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1068 {
1069 DataLayout dL(JM()->mpCurrentModule);
1070 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1071 Value* iLastIndex = pLastIndex;
1072 Value* iIndices = pIndices;
1073
1074 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1075 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1076 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1077 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1078
1079 // create a vector of index counts from the base index ptr passed into the fetch
1080 Constant* vIndexOffsets;
1081 if (mVWidth == 8)
1082 {
1083 vIndexOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
1084 }
1085 else
1086 {
1087 vIndexOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
1088 }
1089
1090 // compare index count to the max valid index
1091 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1092 // vIndexOffsets 0 1 2 3 4 5 6 7
1093 // ------------------------------
1094 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1095 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1096 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1097 Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);
1098
1099 // Load the indices; OOB loads 0
1100 pIndices = BITCAST(pIndices, PointerType::get(mSimdInt32Ty, 0));
1101 return MASKED_LOAD(pIndices, 4, vIndexMask, VIMMED1(0));
1102 }
1103
1104 //////////////////////////////////////////////////////////////////////////
1105 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1106 /// denormalizes if needed, converts to F32 if needed, and positions in
1107 /// the proper SIMD rows to be output to the simdvertex structure
1108 /// @param args: (tuple of args, listed below)
1109 /// @param vGatherResult - 8 gathered 8bpc vertices
1110 /// @param pVtxOut - base pointer to output simdvertex struct
1111 /// @param extendType - sign extend or zero extend
1112 /// @param conversionType - type of conversion to apply to the gathered data
1113 /// @param currentVertexElement - reference to the current vVertexElement
1114 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1115 /// @param compMask - component packing mask
1116 /// @param compCtrl - component control val
1117 /// @param vVertexElements[4] - vertex components to output
1118 /// @param swizzle[4] - component swizzle location
1119 void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args)
1120 {
1121 // Unpack tuple args
1122 Value*& vGatherResult = std::get<0>(args);
1123 Value* pVtxOut = std::get<1>(args);
1124 const Instruction::CastOps extendType = std::get<2>(args);
1125 const ConversionType conversionType = std::get<3>(args);
1126 uint32_t &currentVertexElement = std::get<4>(args);
1127 uint32_t &outputElt = std::get<5>(args);
1128 const ComponentEnable compMask = std::get<6>(args);
1129 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1130 Value* (&vVertexElements)[4] = std::get<8>(args);
1131 const uint32_t(&swizzle)[4] = std::get<9>(args);
1132
1133 // cast types
1134 Type *vGatherTy = VectorType::get(mInt32Ty, 8);
1135 Type *v32x8Ty = VectorType::get(mInt8Ty, 32);
1136
1137 // have to do extra work for sign extending
1138 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
1139 {
1140 Type *v16x8Ty = VectorType::get(mInt8Ty, 16); // 16x8bit ints in a 128bit lane
1141 Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1142
1143 // shuffle mask, including any swizzling
1144 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1145 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1146 Value *vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12),
1147 char(y), char(y + 4), char(y + 8), char(y + 12),
1148 char(z), char(z + 4), char(z + 8), char(z + 12),
1149 char(w), char(w + 4), char(w + 8), char(w + 12),
1150 char(x), char(x + 4), char(x + 8), char(x + 12),
1151 char(y), char(y + 4), char(y + 8), char(y + 12),
1152 char(z), char(z + 4), char(z + 8), char(z + 12),
1153 char(w), char(w + 4), char(w + 8), char(w + 12) });
1154
1155 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
1156
1157 Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1158 Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1159
1160 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1161 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1162
1163 // after pshufb: group components together in each 128bit lane
1164 // 256i - 0 1 2 3 4 5 6 7
1165 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1166
1167 Value *vi128XY_lo = nullptr;
1168 Value *vi128XY_hi = nullptr;
1169 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1170 {
1171 vi128XY_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
1172 vi128XY_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
1173
1174 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1175 // 256i - 0 1 2 3 4 5 6 7
1176 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1177 }
1178
1179 // do the same for zw components
1180 Value *vi128ZW_lo = nullptr;
1181 Value *vi128ZW_hi = nullptr;
1182 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1183 {
1184 vi128ZW_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
1185 vi128ZW_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
1186 }
1187
1188 // init denormalize variables if needed
1189 Instruction::CastOps fpCast;
1190 Value *conversionFactor;
1191
1192 switch (conversionType)
1193 {
1194 case CONVERT_NORMALIZED:
1195 fpCast = Instruction::CastOps::SIToFP;
1196 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1197 break;
1198 case CONVERT_SSCALED:
1199 fpCast = Instruction::CastOps::SIToFP;
1200 conversionFactor = VIMMED1((float)(1.0));
1201 break;
1202 case CONVERT_USCALED:
1203 SWR_INVALID("Type should not be sign extended!");
1204 conversionFactor = nullptr;
1205 break;
1206 default:
1207 SWR_ASSERT(conversionType == CONVERT_NONE);
1208 conversionFactor = nullptr;
1209 break;
1210 }
1211
1212 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1213 for (uint32_t i = 0; i < 4; i++)
1214 {
1215 if (isComponentEnabled(compMask, i))
1216 {
1217 if (compCtrl[i] == ComponentControl::StoreSrc)
1218 {
1219 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1220 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1221 // if x or y, use vi128XY permute result, else use vi128ZW
1222 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1223 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1224
1225 // sign extend
1226 Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
1227 Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
1228
1229 Value* temp = JOIN_16(temp_lo, temp_hi);
1230
1231 // denormalize if needed
1232 if (conversionType != CONVERT_NONE)
1233 {
1234 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1235 }
1236
1237 vVertexElements[currentVertexElement] = temp;
1238
1239 currentVertexElement += 1;
1240 }
1241 else
1242 {
1243 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1244 }
1245
1246 if (currentVertexElement > 3)
1247 {
1248 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1249 // reset to the next vVertexElement to output
1250 currentVertexElement = 0;
1251 }
1252 }
1253 }
1254 }
1255 // else zero extend
1256 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1257 {
1258 // init denormalize variables if needed
1259 Instruction::CastOps fpCast;
1260 Value *conversionFactor;
1261
1262 switch (conversionType)
1263 {
1264 case CONVERT_NORMALIZED:
1265 fpCast = Instruction::CastOps::UIToFP;
1266 conversionFactor = VIMMED1((float)(1.0 / 255.0));
1267 break;
1268 case CONVERT_USCALED:
1269 fpCast = Instruction::CastOps::UIToFP;
1270 conversionFactor = VIMMED1((float)(1.0));
1271 break;
1272 case CONVERT_SSCALED:
1273 SWR_INVALID("Type should not be zero extended!");
1274 conversionFactor = nullptr;
1275 break;
1276 default:
1277 SWR_ASSERT(conversionType == CONVERT_NONE);
1278 conversionFactor = nullptr;
1279 break;
1280 }
1281
1282 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1283 for (uint32_t i = 0; i < 4; i++)
1284 {
1285 if (isComponentEnabled(compMask, i))
1286 {
1287 if (compCtrl[i] == ComponentControl::StoreSrc)
1288 {
1289 // pshufb masks for each component
1290 Value *vConstMask;
1291 switch (swizzle[i])
1292 {
1293 case 0:
1294 // x shuffle mask
1295 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1296 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
1297 break;
1298 case 1:
1299 // y shuffle mask
1300 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1301 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
1302 break;
1303 case 2:
1304 // z shuffle mask
1305 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1306 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
1307 break;
1308 case 3:
1309 // w shuffle mask
1310 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1311 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
1312 break;
1313 default:
1314 vConstMask = nullptr;
1315 break;
1316 }
1317
1318 Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1319 Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1320
1321 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1322 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1323
1324 // after pshufb for x channel
1325 // 256i - 0 1 2 3 4 5 6 7
1326 // x000 x000 x000 x000 x000 x000 x000 x000
1327
1328 Value* temp = JOIN_16(temp_lo, temp_hi);
1329
1330 // denormalize if needed
1331 if (conversionType != CONVERT_NONE)
1332 {
1333 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1334 }
1335
1336 vVertexElements[currentVertexElement] = temp;
1337
1338 currentVertexElement += 1;
1339 }
1340 else
1341 {
1342 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1343 }
1344
1345 if (currentVertexElement > 3)
1346 {
1347 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1348 // reset to the next vVertexElement to output
1349 currentVertexElement = 0;
1350 }
1351 }
1352 }
1353 }
1354 else
1355 {
1356 SWR_INVALID("Unsupported conversion type");
1357 }
1358 }
1359
1360 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1361 {
1362 // Unpack tuple args
1363 Value*& vGatherResult = std::get<0>(args);
1364 Value* pVtxOut = std::get<1>(args);
1365 const Instruction::CastOps extendType = std::get<2>(args);
1366 const ConversionType conversionType = std::get<3>(args);
1367 uint32_t &currentVertexElement = std::get<4>(args);
1368 uint32_t &outputElt = std::get<5>(args);
1369 const ComponentEnable compMask = std::get<6>(args);
1370 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1371 Value* (&vVertexElements)[4] = std::get<8>(args);
1372 const uint32_t(&swizzle)[4] = std::get<9>(args);
1373
1374 // cast types
1375 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1376
1377 for (uint32_t i = 0; i < 4; i++)
1378 {
1379 if (!isComponentEnabled(compMask, i))
1380 continue;
1381
1382 if (compCtrl[i] == ComponentControl::StoreSrc)
1383 {
1384 std::vector<uint32_t> vShuffleMasks[4] = {
1385 { 0, 4, 8, 12, 16, 20, 24, 28 }, // x
1386 { 1, 5, 9, 13, 17, 21, 25, 29 }, // y
1387 { 2, 6, 10, 14, 18, 22, 26, 30 }, // z
1388 { 3, 7, 11, 15, 19, 23, 27, 31 }, // w
1389 };
1390
1391 Value *val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
1392 UndefValue::get(v32x8Ty),
1393 vShuffleMasks[swizzle[i]]);
1394
1395 if ((extendType == Instruction::CastOps::SExt) ||
1396 (extendType == Instruction::CastOps::SIToFP)) {
1397 switch (conversionType)
1398 {
1399 case CONVERT_NORMALIZED:
1400 val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
1401 break;
1402 case CONVERT_SSCALED:
1403 val = SI_TO_FP(val, mSimdFP32Ty);
1404 break;
1405 case CONVERT_USCALED:
1406 SWR_INVALID("Type should not be sign extended!");
1407 break;
1408 default:
1409 SWR_ASSERT(conversionType == CONVERT_NONE);
1410 val = S_EXT(val, mSimdInt32Ty);
1411 break;
1412 }
1413 }
1414 else if ((extendType == Instruction::CastOps::ZExt) ||
1415 (extendType == Instruction::CastOps::UIToFP)) {
1416 switch (conversionType)
1417 {
1418 case CONVERT_NORMALIZED:
1419 val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
1420 break;
1421 case CONVERT_SSCALED:
1422 SWR_INVALID("Type should not be zero extended!");
1423 break;
1424 case CONVERT_USCALED:
1425 val = UI_TO_FP(val, mSimdFP32Ty);
1426 break;
1427 default:
1428 SWR_ASSERT(conversionType == CONVERT_NONE);
1429 val = Z_EXT(val, mSimdInt32Ty);
1430 break;
1431 }
1432 }
1433 else
1434 {
1435 SWR_INVALID("Unsupported conversion type");
1436 }
1437
1438 vVertexElements[currentVertexElement++] = val;
1439 }
1440 else
1441 {
1442 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1443 }
1444
1445 if (currentVertexElement > 3)
1446 {
1447 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1448 // reset to the next vVertexElement to output
1449 currentVertexElement = 0;
1450 }
1451 }
1452 }
1453
1454 //////////////////////////////////////////////////////////////////////////
1455 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1456 /// denormalizes if needed, converts to F32 if needed, and positions in
1457 /// the proper SIMD rows to be output to the simdvertex structure
1458 /// @param args: (tuple of args, listed below)
1459 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1460 /// @param pVtxOut - base pointer to output simdvertex struct
1461 /// @param extendType - sign extend or zero extend
1462 /// @param conversionType - type of conversion to apply to the gathered data
1463 /// @param currentVertexElement - reference to the current vVertexElement
1464 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1465 /// @param compMask - component packing mask
1466 /// @param compCtrl - component control val
1467 /// @param vVertexElements[4] - vertex components to output
1468 void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
1469 {
1470 // Unpack tuple args
1471 Value* (&vGatherResult)[2] = std::get<0>(args);
1472 Value* pVtxOut = std::get<1>(args);
1473 const Instruction::CastOps extendType = std::get<2>(args);
1474 const ConversionType conversionType = std::get<3>(args);
1475 uint32_t &currentVertexElement = std::get<4>(args);
1476 uint32_t &outputElt = std::get<5>(args);
1477 const ComponentEnable compMask = std::get<6>(args);
1478 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1479 Value* (&vVertexElements)[4] = std::get<8>(args);
1480
1481 // cast types
1482 Type *vGatherTy = VectorType::get(mInt32Ty, 8);
1483 Type *v32x8Ty = VectorType::get(mInt8Ty, 32);
1484
1485 // have to do extra work for sign extending
1486 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
1487 {
1488 // is this a packed half-precision (FP16) float that needs FPExt?
1489 bool bFP = (extendType == Instruction::CastOps::FPExt);
1490
1491 Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1492 Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1493
1494 // shuffle mask
1495 Value *vConstMask = C<uint8_t>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1496 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
1497 Value *vi128XY_lo = nullptr;
1498 Value *vi128XY_hi = nullptr;
1499 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1500 {
1501 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
1502
1503 Value *vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty);
1504 Value *vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty);
1505
1506 Value *vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1507 Value *vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1508
1509 // after pshufb: group components together in each 128bit lane
1510 // 256i - 0 1 2 3 4 5 6 7
1511 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1512
1513 vi128XY_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1514 vi128XY_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1515
1516 // after PERMD: move and pack xy components into each 128bit lane
1517 // 256i - 0 1 2 3 4 5 6 7
1518 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1519 }
1520
1521 // do the same for zw components
1522 Value *vi128ZW_lo = nullptr;
1523 Value *vi128ZW_hi = nullptr;
1524 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1525 {
1526 Value *vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty);
1527 Value *vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty);
1528
1529 Value *vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1530 Value *vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1531
1532 vi128ZW_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1533 vi128ZW_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1534 }
1535
1536 // init denormalize variables if needed
1537 Instruction::CastOps IntToFpCast;
1538 Value *conversionFactor;
1539
1540 switch (conversionType)
1541 {
1542 case CONVERT_NORMALIZED:
1543 IntToFpCast = Instruction::CastOps::SIToFP;
1544 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1545 break;
1546 case CONVERT_SSCALED:
1547 IntToFpCast = Instruction::CastOps::SIToFP;
1548 conversionFactor = VIMMED1((float)(1.0));
1549 break;
1550 case CONVERT_USCALED:
1551 SWR_INVALID("Type should not be sign extended!");
1552 conversionFactor = nullptr;
1553 break;
1554 default:
1555 SWR_ASSERT(conversionType == CONVERT_NONE);
1556 conversionFactor = nullptr;
1557 break;
1558 }
1559
1560 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1561 for (uint32_t i = 0; i < 4; i++)
1562 {
1563 if (isComponentEnabled(compMask, i))
1564 {
1565 if (compCtrl[i] == ComponentControl::StoreSrc)
1566 {
1567 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1568 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1569 // if x or y, use vi128XY permute result, else use vi128ZW
1570 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1571 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1572
1573 if (bFP)
1574 {
1575 // extract 128 bit lanes and convert each half-precision component to float
1576 Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1577 Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1578
1579 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
1580 }
1581 else
1582 {
1583 // extract 128 bit lanes to sign extend each component
1584 Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1585 Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1586
1587 Value* temp = JOIN_16(temp_lo, temp_hi);
1588
1589 // denormalize if needed
1590 if (conversionType != CONVERT_NONE)
1591 {
1592 temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor);
1593 }
1594
1595 vVertexElements[currentVertexElement] = temp;
1596 }
1597
1598 currentVertexElement += 1;
1599 }
1600 else
1601 {
1602 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1603 }
1604
1605 if (currentVertexElement > 3)
1606 {
1607 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1608 // reset to the next vVertexElement to output
1609 currentVertexElement = 0;
1610 }
1611 }
1612 }
1613 }
1614 // else zero extend
1615 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1616 {
1617 // pshufb masks for each component
1618 Value *vConstMask[2];
1619
1620 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
1621 {
1622 // x/z shuffle mask
1623 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1624 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1625 }
1626
1627 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
1628 {
1629 // y/w shuffle mask
1630 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1631 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
1632 }
1633
1634 // init denormalize variables if needed
1635 Instruction::CastOps fpCast;
1636 Value* conversionFactor;
1637
1638 switch (conversionType)
1639 {
1640 case CONVERT_NORMALIZED:
1641 fpCast = Instruction::CastOps::UIToFP;
1642 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1643 break;
1644 case CONVERT_USCALED:
1645 fpCast = Instruction::CastOps::UIToFP;
1646 conversionFactor = VIMMED1((float)(1.0f));
1647 break;
1648 case CONVERT_SSCALED:
1649 SWR_INVALID("Type should not be zero extended!");
1650 conversionFactor = nullptr;
1651 break;
1652 default:
1653 SWR_ASSERT(conversionType == CONVERT_NONE);
1654 conversionFactor = nullptr;
1655 break;
1656 }
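// Worked example of the denormalization selected above: with CONVERT_NORMALIZED
// (UNORM16), a raw component of 65535 is zero extended, converted via UIToFP and
// multiplied by 1/65535, giving exactly 1.0f; 32768 maps to
// 32768.0f * (1.0f / 65535.0f) ~= 0.5f.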
1657
1658 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1659 for (uint32_t i = 0; i < 4; i++)
1660 {
1661 if (isComponentEnabled(compMask, i))
1662 {
1663 if (compCtrl[i] == ComponentControl::StoreSrc)
1664 {
1665 // select correct constMask for x/z or y/w pshufb
1666 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1667 // if x or y, use the first (xy) gather result, else use the second (zw) gather result
1668 uint32_t selectedGather = (i < 2) ? 0 : 1;
1669
1670 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
1671
1672 Value *vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
1673 Value *vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
1674
1675 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1676 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1677
1678 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1679 // 256i - 0 1 2 3 4 5 6 7
1680 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1681
1682 Value* temp = JOIN_16(temp_lo, temp_hi);
1683
1684 // denormalize if needed
1685 if (conversionType != CONVERT_NONE)
1686 {
1687 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1688 }
1689
1690 vVertexElements[currentVertexElement] = temp;
1691
1692 currentVertexElement += 1;
1693 }
1694 else
1695 {
1696 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1697 }
1698
1699 if (currentVertexElement > 3)
1700 {
1701 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1702 // reset to the next vVertexElement to output
1703 currentVertexElement = 0;
1704 }
1705 }
1706 }
1707 }
1708 else
1709 {
1710 SWR_INVALID("Unsupported conversion type");
1711 }
1712 }
1713
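// A minimal scalar sketch of the per-lane PSHUFB semantics relied on above (the
// helper name is illustrative and the function is not used by the jitter): each
// destination byte is either zeroed (mask byte has bit 7 set) or copies the byte
// selected by the low four mask bits from the same 128-bit lane of the source.
// With the mask { 0,1,4,5,8,9,12,13, 2,3,6,7,10,11,14,15 } this regroups the
// interleaved 16-bit xyxyxyxy data of each lane into xxxxyyyy, matching the
// diagrams above.
static inline void ScalarPshufbLanes(const uint8_t* pSrc, const uint8_t* pMask,
                                     uint8_t* pDst, size_t numBytes)
{
    // numBytes must be a multiple of 16: one 128-bit lane per 16 bytes
    for (size_t i = 0; i < numBytes; ++i)
    {
        size_t laneBase = i & ~size_t(15);
        pDst[i] = (pMask[i] & 0x80) ? 0 : pSrc[laneBase + (pMask[i] & 0x0F)];
    }
}
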
1714 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
1715 {
1716 // Unpack tuple args
1717 Value* (&vGatherResult)[2] = std::get<0>(args);
1718 Value* pVtxOut = std::get<1>(args);
1719 const Instruction::CastOps extendType = std::get<2>(args);
1720 const ConversionType conversionType = std::get<3>(args);
1721 uint32_t &currentVertexElement = std::get<4>(args);
1722 uint32_t &outputElt = std::get<5>(args);
1723 const ComponentEnable compMask = std::get<6>(args);
1724 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1725 Value* (&vVertexElements)[4] = std::get<8>(args);
1726
1727 // cast types
1728 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1729 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1730
1731 // have to do extra work for sign extending
1732 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) ||
1733 (extendType == Instruction::CastOps::FPExt))
1734 {
1735 // is this a half-precision (FP16) float?
1736 bool bFP = (extendType == Instruction::CastOps::FPExt);
1737
1738 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1739 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1740
1741 // shuffle mask
1742 Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1743 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
1744 Value* vi128XY = nullptr;
1745 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) {
1746 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1747 // after pshufb: group components together in each 128bit lane
1748 // 256i - 0 1 2 3 4 5 6 7
1749 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1750
1751 vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1752 // after PERMD: move and pack xy components into each 128bit lane
1753 // 256i - 0 1 2 3 4 5 6 7
1754 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1755 }
1756
1757 // do the same for zw components
1758 Value* vi128ZW = nullptr;
1759 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) {
1760 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1761 vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1762 }
1763
1764 // init denormalize variables if needed
1765 Instruction::CastOps IntToFpCast;
1766 Value* conversionFactor;
1767
1768 switch (conversionType)
1769 {
1770 case CONVERT_NORMALIZED:
1771 IntToFpCast = Instruction::CastOps::SIToFP;
1772 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1773 break;
1774 case CONVERT_SSCALED:
1775 IntToFpCast = Instruction::CastOps::SIToFP;
1776 conversionFactor = VIMMED1((float)(1.0));
1777 break;
1778 case CONVERT_USCALED:
1779 SWR_INVALID("Type should not be sign extended!");
1780 conversionFactor = nullptr;
1781 break;
1782 default:
1783 SWR_ASSERT(conversionType == CONVERT_NONE);
1784 conversionFactor = nullptr;
1785 break;
1786 }
1787
1788 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1789 for (uint32_t i = 0; i < 4; i++)
1790 {
1791 if (isComponentEnabled(compMask, i))
1792 {
1793 if (compCtrl[i] == ComponentControl::StoreSrc)
1794 {
1795 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1796 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1797 // if x or y, use vi128XY permute result, else use vi128ZW
1798 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1799
1800 if (bFP) {
1801 // extract 128 bit lanes and convert each half-precision component to float
1802 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1803 }
1804 else {
1805 // extract 128 bit lanes to sign extend each component
1806 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1807
1808 // denormalize if needed
1809 if (conversionType != CONVERT_NONE) {
1810 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1811 }
1812 }
1813 currentVertexElement++;
1814 }
1815 else
1816 {
1817 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1818 }
1819
1820 if (currentVertexElement > 3)
1821 {
1822 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1823 // reset to the next vVertexElement to output
1824 currentVertexElement = 0;
1825 }
1826 }
1827 }
1828 }
1829 // else zero extend
1830 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1831 {
1832 // pshufb masks for each component
1833 Value* vConstMask[2];
1834 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) {
1835 // x/z shuffle mask
1836 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1837 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1838 }
1839
1840 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) {
1841 // y/w shuffle mask
1842 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1843 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
1844 }
1845
1846 // init denormalize variables if needed
1847 Instruction::CastOps fpCast;
1848 Value* conversionFactor;
1849
1850 switch (conversionType)
1851 {
1852 case CONVERT_NORMALIZED:
1853 fpCast = Instruction::CastOps::UIToFP;
1854 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1855 break;
1856 case CONVERT_USCALED:
1857 fpCast = Instruction::CastOps::UIToFP;
1858 conversionFactor = VIMMED1((float)(1.0f));
1859 break;
1860 case CONVERT_SSCALED:
1861 SWR_INVALID("Type should not be zero extended!");
1862 conversionFactor = nullptr;
1863 break;
1864 default:
1865 SWR_ASSERT(conversionType == CONVERT_NONE);
1866 conversionFactor = nullptr;
1867 break;
1868 }
1869
1870 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1871 for (uint32_t i = 0; i < 4; i++)
1872 {
1873 if (isComponentEnabled(compMask, i))
1874 {
1875 if (compCtrl[i] == ComponentControl::StoreSrc)
1876 {
1877 // select correct constMask for x/z or y/w pshufb
1878 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1879 // if x or y, use the first (xy) gather result, else use the second (zw) gather result
1880 uint32_t selectedGather = (i < 2) ? 0 : 1;
1881
1882 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1883 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1884 // 256i - 0 1 2 3 4 5 6 7
1885 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1886
1887 // denormalize if needed
1888 if (conversionType != CONVERT_NONE)
1889 {
1890 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1891 }
1892 currentVertexElement++;
1893 }
1894 else
1895 {
1896 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1897 }
1898
1899 if (currentVertexElement > 3)
1900 {
1901 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1902 // reset to the next vVertexElement to output
1903 currentVertexElement = 0;
1904 }
1905 }
1906 }
1907 }
1908 else
1909 {
1910 SWR_INVALID("Unsupported conversion type");
1911 }
1912 }
1913
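// Scalar sketch of the 16bpc conversion paths chosen in the two functions above
// (illustrative only; the helper is not called by the jitter and models just the
// NORMALIZED and SCALED cases, not CONVERT_NONE or the FP16 path): the raw 16-bit
// component is sign or zero extended and then scaled by the factor picked in the
// switch statements (1/32767 for SNORM, 1/65535 for UNORM, 1.0 for scaled formats).
static inline float ScalarConvert16bpc(uint16_t raw, bool isSigned, ConversionType conversionType)
{
    // sign or zero extend to 32 bits, mirroring PMOVSXWD / the zero-extending pshufb
    int32_t extended = isSigned ? (int32_t)(int16_t)raw : (int32_t)raw;

    // pick the conversion factor exactly as the switches above do
    float factor = 1.0f;
    if (conversionType == CONVERT_NORMALIZED)
    {
        factor = isSigned ? (float)(1.0 / 32767.0) : (float)(1.0 / 65535.0);
    }

    return (float)extended * factor;
}
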
1914 //////////////////////////////////////////////////////////////////////////
1915 /// @brief Output a simdvertex worth of elements to the current outputElt
1916 /// @param pVtxOut - base address of VIN output struct
1917 /// @param outputElt - simdvertex offset in VIN to write to
1918 /// @param numEltsToStore - number of simdvertex rows to write out
1919 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
1920 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
1921 {
1922 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
1923
1924 for (uint32_t c = 0; c < numEltsToStore; ++c)
1925 {
1926 // STORE expects FP32 x vWidth type, just bitcast if needed
1927 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
1928 {
1929 #if FETCH_DUMP_VERTEX
1930 PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
1931 #endif
1932 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
1933 }
1934 #if FETCH_DUMP_VERTEX
1935 else
1936 {
1937 PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
1938 }
1939 #endif
1940 // outputElt * 4 = offsetting by the size of a simdvertex
1941 // + c offsets to a 32bit x vWidth row within the current vertex
1942 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), nullptr, "destGEP");
1943 STORE(vVertexElements[c], dest);
1944 }
1945 }
1946
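// Illustrative address computation matching the GEP comment above (the helper
// name and the flat view of the output as packed floats are assumptions for this
// sketch; the jitter emits the GEP/STORE directly): a simdvertex is 4 consecutive
// rows of vWidth floats (x, y, z, w for every SIMD lane), so component c of
// simdvertex outputElt lives at row outputElt * 4 + c.
static inline float* SimdVertexRow(float* pVtxBase, uint32_t vWidth,
                                   uint32_t outputElt, uint32_t component)
{
    return pVtxBase + ((size_t)outputElt * 4 + component) * (size_t)vWidth;
}
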
1947 //////////////////////////////////////////////////////////////////////////
1948 /// @brief Generates a constant vector of values based on the
1949 /// ComponentControl value
1950 /// @param ctrl - ComponentControl value
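/// Per-lane values produced: Store0 -> 0, Store1Fp -> 1.0f, Store1Int -> 1,
/// StoreVertexId -> the FetchInfo VertexID for that lane (VertexID2 supplies
/// lanes 8..15 when mVWidth == 16), StoreInstanceId -> CurInstance broadcast
/// to every lane, NoStore -> undefined.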
1951 Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
1952 {
1953 switch (ctrl)
1954 {
1955 case NoStore:
1956 return VUNDEF_I();
1957 case Store0:
1958 return VIMMED1(0);
1959 case Store1Fp:
1960 return VIMMED1(1.0f);
1961 case Store1Int:
1962 return VIMMED1(1);
1963 case StoreVertexId:
1964 {
1965 if (mVWidth == 16)
1966 {
1967 Type* pSimd8FPTy = VectorType::get(mFP32Ty, 8);
1968 Value *pIdLo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), pSimd8FPTy);
1969 Value *pIdHi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), pSimd8FPTy);
1970 return JOIN_16(pIdLo, pIdHi);
1971 }
1972 else
1973 {
1974 return BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
1975 }
1976 }
1977 case StoreInstanceId:
1978 {
1979 Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
1980 return VBROADCAST(pId);
1981 }
1982
1983
1984 case StoreSrc:
1985 default:
1986 SWR_INVALID("Invalid component control");
1987 return VUNDEF_I();
1988 }
1989 }
1990
1991 //////////////////////////////////////////////////////////////////////////
1992 /// @brief Returns true if the specified component is enabled in the mask.
1993 /// @param enableMask - enable bits
1994 /// @param component - component to check if enabled.
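/// e.g. an enableMask of (ComponentEnable::X | ComponentEnable::Z) reports
/// components 0 and 2 as enabled and components 1 and 3 as disabled.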
1995 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
1996 {
1997 switch (component)
1998 {
1999 // X
2000 case 0: return (enableMask & ComponentEnable::X);
2001 // Y
2002 case 1: return (enableMask & ComponentEnable::Y);
2003 // Z
2004 case 2: return (enableMask & ComponentEnable::Z);
2005 // W
2006 case 3: return (enableMask & ComponentEnable::W);
2007
2008 default: return false;
2009 }
2010 }
2011
2012 // Don't allow two threads to compile the same fetch shader simultaneously;
2013 // the JIT cache implementation has problems with concurrent compilation.
2014 // Right now this is only an issue for fetch shaders.
2015 static std::mutex gFetchCodegenMutex;
2016
2017 //////////////////////////////////////////////////////////////////////////
2018 /// @brief JITs from fetch shader IR
2019 /// @param hJitMgr - JitManager handle
2020 /// @param func - LLVM function IR
2021 /// @return PFN_FETCH_FUNC - pointer to fetch code
2022 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
2023 {
2024 const llvm::Function* func = (const llvm::Function*)hFunc;
2025 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2026 PFN_FETCH_FUNC pfnFetch;
2027
2028 gFetchCodegenMutex.lock();
2029 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
2030 // MCJIT finalizes modules the first time code is JITted from them. Once finalized, no new IR can be added to the module.
2031 pJitMgr->mIsModuleFinalized = true;
2032
2033 #if defined(KNOB_SWRC_TRACING)
2034 char fName[1024];
2035 const char *funcName = func->getName().data();
2036 snprintf(fName, sizeof(fName), "%s.bin", funcName);
2037 FILE *fd = fopen(fName, "wb");
2038 if (fd) { fwrite((void *)pfnFetch, 1, 2048, fd); fclose(fd); }
2039
2040 #endif
2041
2042 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
2043 gFetchCodegenMutex.unlock();
2044
2045
2046
2047 return pfnFetch;
2048 }
2049
2050 //////////////////////////////////////////////////////////////////////////
2051 /// @brief JIT compiles fetch shader
2052 /// @param hJitMgr - JitManager handle
2053 /// @param state - fetch state to build function from
2054 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
2055 {
2056 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2057
2058 pJitMgr->SetupNewModule();
2059
2060 FetchJit theJit(pJitMgr);
2061 HANDLE hFunc = theJit.Create(state);
2062
2063 return JitFetchFunc(hJitMgr, hFunc);
2064 }
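
// Hypothetical driver-side usage sketch; aside from FETCH_COMPILE_STATE,
// PFN_FETCH_FUNC and JitCompileFetch, the setup shown is elided and not
// asserted by this file:
//
//   FETCH_COMPILE_STATE fetchState = {};
//   // ...describe the vertex layout, index type and conversions...
//   PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, fetchState);
//   // the SWR core then calls pfnFetch to gather, convert and store vertices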