1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder_gfx_mem.h"
32 #include "jit_api.h"
33 #include "fetch_jit.h"
34 #include "gen_state_llvm.h"
35 #include "functionpasses/passes.h"
36
37 //#define FETCH_DUMP_VERTEX 1
38 using namespace llvm;
39 using namespace SwrJit;
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45 CONVERT_NONE,
46 CONVERT_NORMALIZED,
47 CONVERT_USCALED,
48 CONVERT_SSCALED,
49 CONVERT_SFIXED,
50 };
51
52 //////////////////////////////////////////////////////////////////////////
53 /// Interface to Jitting a fetch shader
54 //////////////////////////////////////////////////////////////////////////
55 struct FetchJit : public BuilderGfxMem
56 {
57 FetchJit(JitManager* pJitMgr) :
58 BuilderGfxMem(pJitMgr)
59 {}
60
61 Function* Create(const FETCH_COMPILE_STATE& fetchState);
62
63 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
64 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
65 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
66
67 // package up Shuffle*bpcGatherd args into a tuple for convenience
68 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
69 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
70 const uint32_t(&)[4]> Shuffle8bpcArgs;
71
72 void Shuffle8bpcGatherd16(Shuffle8bpcArgs &args);
73 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
74
75 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
76 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
77
78 void Shuffle16bpcGather16(Shuffle16bpcArgs &args);
79 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
80
81 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
82
83 Value *GenerateCompCtrlVector(const ComponentControl ctrl);
84
85 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
86
87 bool IsOddFormat(SWR_FORMAT format);
88 bool IsUniformFormat(SWR_FORMAT format);
89 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
90 void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
91 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
92
93 Value* mpFetchInfo;
94 };
95
96 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
97 {
98 std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
99 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
100
101 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
102 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
103
104 fetch->getParent()->setModuleIdentifier(fetch->getName());
105
106 IRB()->SetInsertPoint(entry);
107
108 auto argitr = fetch->arg_begin();
109
110 // Fetch shader arguments
111 Value* privateContext = &*argitr; ++argitr;
112 privateContext->setName("privateContext");
113 SetPrivateContext(privateContext);
114
115 mpFetchInfo = &*argitr; ++argitr;
116 mpFetchInfo->setName("fetchInfo");
117 Value* pVtxOut = &*argitr;
118 pVtxOut->setName("vtxOutput");
119
120 uint32_t baseWidth = mVWidth;
121
122 SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth);
123
124 // Override builder target width to force 16-wide SIMD
125 #if USE_SIMD16_SHADERS
126 SetTargetWidth(16);
127 #endif
128
129 pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0));
130
131 // SWR_FETCH_CONTEXT::pStreams
132 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
133 streams->setName("pStreams");
134
135 // SWR_FETCH_CONTEXT::pIndices
136 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_xpIndices});
137 indices->setName("pIndices");
138
139 // SWR_FETCH_CONTEXT::pLastIndex
140 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_xpLastIndex});
141 pLastIndex->setName("pLastIndex");
142
143 Value* vIndices;
144 switch(fetchState.indexType)
145 {
146 case R8_UINT:
147 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
148 if(fetchState.bDisableIndexOOBCheck)
149 {
150 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
151 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
152 }
153 else
154 {
155 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
156 }
157 break;
158 case R16_UINT:
159 if(fetchState.bDisableIndexOOBCheck)
160 {
161 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
162 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
163 }
164 else
165 {
166 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
167 }
168 break;
169 case R32_UINT:
170 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(indices, "", PointerType::get(mSimdInt32Ty, 0), GFX_MEM_CLIENT_FETCH)
171 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
172 break; // incoming type is already 32bit int
173 default:
174 SWR_INVALID("Unsupported index type");
175 vIndices = nullptr;
176 break;
177 }
178
179 if(fetchState.bForceSequentialAccessEnable)
180 {
181 Value* pOffsets = mVWidth == 8 ? C({ 0, 1, 2, 3, 4, 5, 6, 7 }) :
182 C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
183
184 // VertexData buffers are accessed sequentially; the index is equal to the vertex number
185 vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
186 vIndices = ADD(vIndices, pOffsets);
187 }
188
189 Value* vVertexId = vIndices;
190 if (fetchState.bVertexIDOffsetEnable)
191 {
192 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
193 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
194 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
195 vVertexId = ADD(vIndices, vBaseVertex);
196 vVertexId = ADD(vVertexId, vStartVertex);
197 }
198
199 // store out vertex IDs
200 if (mVWidth == 16)
201 {
202 // store out in simd8 halves until core supports 16-wide natively
203 auto vVertexIdLo = EXTRACT_16(vVertexId, 0);
204 auto vVertexIdHi = EXTRACT_16(vVertexId, 1);
205 STORE(vVertexIdLo, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
206 STORE(vVertexIdHi, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
207 }
208 else if (mVWidth == 8)
209 {
210 STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
211 }
212
213 // store out cut mask if enabled
214 if (fetchState.bEnableCutIndex)
215 {
216 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
217 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
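// each lane of cutMask is all ones where the fetched index equals the cut (primitive restart) index, all zeros otherwise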
218
219 if (mVWidth == 16)
220 {
221 auto cutMaskLo = EXTRACT_16(cutMask, 0);
222 auto cutMaskHi = EXTRACT_16(cutMask, 1);
223 STORE(cutMaskLo, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
224 STORE(cutMaskHi, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
225 }
226 else if (mVWidth == 8)
227 {
228 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
229 }
230 }
231
232 // Fetch attributes from memory and output to a simdvertex struct
233 JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
234
235 RET_VOID();
236
237 JitManager::DumpToFile(fetch, "src");
238
239 #if defined(_DEBUG)
240 verifyFunction(*fetch);
241 #endif
242
243 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
244
245 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
246 setupPasses.add(createBreakCriticalEdgesPass());
247 setupPasses.add(createCFGSimplificationPass());
248 setupPasses.add(createEarlyCSEPass());
249 setupPasses.add(createPromoteMemoryToRegisterPass());
250
251 setupPasses.run(*fetch);
252
253 JitManager::DumpToFile(fetch, "se");
254
255 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
256
257 ///@todo Haven't touched these either. Need to remove some of these and add others.
258 optPasses.add(createCFGSimplificationPass());
259 optPasses.add(createEarlyCSEPass());
260 optPasses.add(createInstructionCombiningPass());
261 optPasses.add(createInstructionSimplifierPass());
262 optPasses.add(createConstantPropagationPass());
263 optPasses.add(createSCCPPass());
264 optPasses.add(createAggressiveDCEPass());
265
266 optPasses.run(*fetch);
267
268 optPasses.add(createLowerX86Pass(JM(), this));
269 optPasses.run(*fetch);
270
271 JitManager::DumpToFile(fetch, "opt");
272
273
274 // Revert 16-wide override
275 #if USE_SIMD16_SHADERS
276 SetTargetWidth(baseWidth);
277 #endif
278
279 return fetch;
280 }
281
282 // returns true for odd formats that require special gather handling
283 bool FetchJit::IsOddFormat(SWR_FORMAT format)
284 {
285 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
286 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
287 {
288 return true;
289 }
290 return false;
291 }
292
293 // format is uniform if all components are the same size and type
294 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
295 {
296 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
297 uint32_t bpc0 = info.bpc[0];
298 uint32_t type0 = info.type[0];
299
300 for (uint32_t c = 1; c < info.numComps; ++c)
301 {
302 if (bpc0 != info.bpc[c] || type0 != info.type[c])
303 {
304 return false;
305 }
306 }
307 return true;
308 }
309
310 // unpacks components based on format
311 // foreach component in the pixel
312 // mask off everything but this component
313 // shift component to LSB
314 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
315 {
316 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
317
318 uint32_t bitOffset = 0;
319 for (uint32_t c = 0; c < info.numComps; ++c)
320 {
321 uint32_t swizzledIndex = info.swizzle[c];
322 uint32_t compBits = info.bpc[c];
323 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
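// e.g. a 10-bit component at bitOffset 10 uses bitmask = 0x3FF << 10 = 0x000FFC00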
324 Value* comp = AND(vInput, bitmask);
325 comp = LSHR(comp, bitOffset);
326
327 result[swizzledIndex] = comp;
328 bitOffset += compBits;
329 }
330 }
331
332 // gather for odd component size formats
333 // gather a SIMD of full pixels per lane, then shift/mask to move each component into its
334 // own vector
335 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
336 {
337 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
338
339 // only works if pixel size is <= 32bits
340 SWR_ASSERT(info.bpp <= 32);
341
342 Value *pGather;
343 if (info.bpp == 32)
344 {
345 pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
346 }
347 else
348 {
349 // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
350 Value *pMem = ALLOCA(mSimdInt32Ty);
351 STORE(VIMMED1(0u), pMem);
352
353 pBase = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
354 Value* pDstMem = BITCAST(pMem, mInt32PtrTy);
355
356 for (uint32_t lane = 0; lane < mVWidth; ++lane)
357 {
358 // Get index
359 Value* index = VEXTRACT(pOffsets, C(lane));
360 Value* mask = VEXTRACT(pMask, C(lane));
361 switch (info.bpp)
362 {
363 case 8:
364 {
365 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
366 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt8Ty, 0));
367 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
368 break;
369 }
370
371 case 16:
372 {
373 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
374 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
375 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
376 break;
377 }
378 break;
379
380 case 24:
381 {
382 // First 16-bits of data
383 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
384 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
385 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
386
387 // Last 8-bits of data
388 pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
389 pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0));
390 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
391 break;
392 }
393
394 default:
395 SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
396 break;
397 }
398 }
399
400 pGather = LOAD(pMem);
401 }
402
403 for (uint32_t comp = 0; comp < 4; ++comp)
404 {
405 pResult[comp] = VIMMED1((int)info.defaults[comp]);
406 }
407
408 UnpackComponents(format, pGather, pResult);
409
410 // cast to fp32
411 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
412 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
413 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
414 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
415 }
416
417 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
418 {
419 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
420
421 for (uint32_t c = 0; c < info.numComps; ++c)
422 {
423 uint32_t compIndex = info.swizzle[c];
424
425 // skip any conversion on UNUSED components
426 if (info.type[c] == SWR_TYPE_UNUSED)
427 {
428 continue;
429 }
430
431 if (info.isNormalized[c])
432 {
433 if (info.type[c] == SWR_TYPE_SNORM)
434 {
435 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
436
437 /// result = c * (1.0f / (2^(n-1) - 1));
438 uint32_t n = info.bpc[c];
439 uint32_t pow2 = 1 << (n - 1);
440 float scale = 1.0f / (float)(pow2 - 1);
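// e.g. for 8-bit SNORM, scale = 1/127, so a raw value of 127 becomes 1.0f and -127 becomes -1.0f (-128 is not clamped here, per the todo above)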
441 Value *vScale = VIMMED1(scale);
442 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
443 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
444 texels[compIndex] = FMUL(texels[compIndex], vScale);
445 }
446 else
447 {
448 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
449
450 /// result = c * (1.0f / (2^n - 1))
451 uint32_t n = info.bpc[c];
452 uint32_t pow2 = 1 << n;
453 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
454 if (n == 24)
455 {
456 float scale = (float)(pow2 - 1);
457 Value* vScale = VIMMED1(scale);
458 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
459 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
460 texels[compIndex] = FDIV(texels[compIndex], vScale);
461 }
462 else
463 {
464 float scale = 1.0f / (float)(pow2 - 1);
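// e.g. for 8-bit UNORM, scale = 1/255, so a raw value of 255 becomes 1.0f and 0 stays 0.0f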
465 Value *vScale = VIMMED1(scale);
466 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
467 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
468 texels[compIndex] = FMUL(texels[compIndex], vScale);
469 }
470 }
471 continue;
472 }
473 }
474 }
475
476 //////////////////////////////////////////////////////////////////////////
477 /// @brief Loads attributes from memory using AVX2 GATHER(s)
478 /// @param fetchState - info about attributes to be fetched from memory
479 /// @param streams - value pointer to the current vertex stream
480 /// @param vIndices - vector value of indices to gather
481 /// @param pVtxOut - value pointer to output simdvertex struct
482 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
483 Value* streams, Value* vIndices, Value* pVtxOut)
484 {
485 uint32_t currentVertexElement = 0;
486 uint32_t outputElt = 0;
487 Value* vVertexElements[4];
488
489 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
490 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
491 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
492 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
493 curInstance->setName("curInstance");
494
495 for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
496 {
497 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
498
499 // skip element if all components are disabled
500 if (ied.ComponentPacking == ComponentEnable::NONE)
501 {
502 continue;
503 }
504
505 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
506 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
507 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
508
509 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
510
511 // VGATHER* takes an *i8 src pointer
512 Value *pStreamBase = INT_TO_PTR(stream, PointerType::get(mInt8Ty, 0));
513
514 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
515 Value *vStride = VBROADCAST(stride);
516
517 // max vertex index that is fully in bounds
518 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
519 maxVertex = LOAD(maxVertex);
520
521 Value *minVertex = NULL;
522 if (fetchState.bPartialVertexBuffer)
523 {
524 // min vertex index for low bounds OOB checking
525 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
526 minVertex = LOAD(minVertex);
527 }
528
529 if (fetchState.bInstanceIDOffsetEnable)
530 {
531 // the InstanceID (curInstance) value is offset by StartInstanceLocation
532 curInstance = ADD(curInstance, startInstance);
533 }
534
535 Value *vCurIndices;
536 Value *startOffset;
537 Value *vInstanceStride = VIMMED1(0);
538
539 if (ied.InstanceEnable)
540 {
541 Value* stepRate = C(ied.InstanceAdvancementState);
542
543 // prevent a div by 0 for 0 step rate
544 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
545 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
546
547 // calc the current offset into instanced data buffer
548 Value* calcInstance = UDIV(curInstance, stepRate);
549
550 // if step rate is 0, every instance gets instance 0
551 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
552
553 vCurIndices = VBROADCAST(calcInstance);
554 startOffset = startInstance;
555 }
556 else if (ied.InstanceStrideEnable)
557 {
558 // grab the instance advancement state, determines stride in bytes from one instance to the next
559 Value* stepRate = C(ied.InstanceAdvancementState);
560 vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
561
562 // offset indices by baseVertex
563 vCurIndices = ADD(vIndices, vBaseVertex);
564
565 startOffset = startVertex;
566 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
567 }
568 else
569 {
570 // offset indices by baseVertex
571 vCurIndices = ADD(vIndices, vBaseVertex);
572 startOffset = startVertex;
573 }
574
575 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
576 // do 64bit address offset calculations.
577
578 // calculate byte offset to the start of the VB
579 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
580 pStreamBase = GEP(pStreamBase, baseOffset);
581 Value* pStreamBaseGFX = ADD(stream, baseOffset);
582
583 // if we have a start offset, subtract from max vertex. Used for OOB check
584 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
585 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
586 // if we have a negative value, we're already OOB. clamp at 0.
587 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
588
589 if (fetchState.bPartialVertexBuffer)
590 {
591 // similarly for min vertex
592 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
593 Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
594 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
595 }
596
597 // Load the in bounds size of a partially valid vertex
598 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
599 partialInboundsSize = LOAD(partialInboundsSize);
600 Value *vPartialVertexSize = VBROADCAST(partialInboundsSize);
601 Value *vBpp = VBROADCAST(C(info.Bpp));
602 Value *vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
603
604 // is the element <= the partially valid size
605 Value *vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
606
607 // override cur indices with 0 if pitch is 0
608 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
609 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
610
611 // are vertices partially OOB?
612 Value* vMaxVertex = VBROADCAST(maxVertex);
613 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
614
615 // are vertices fully in bounds?
616 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
617
618 Value *vGatherMask;
619 if (fetchState.bPartialVertexBuffer)
620 {
621 // are vertices below minVertex limit?
622 Value *vMinVertex = VBROADCAST(minVertex);
623 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
624
625 // only fetch lanes that pass both tests
626 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
627 }
628 else
629 {
630 vGatherMask = vMaxGatherMask;
631 }
632
633 // blend in any partially OOB indices that have valid elements
634 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
635
636 // calculate the actual offsets into the VB
637 Value* vOffsets = MUL(vCurIndices, vStride);
638 vOffsets = ADD(vOffsets, vAlignmentOffsets);
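// e.g. a vertex index of 5 with a 32-byte pitch and an AlignedByteOffset of 12 yields a byte offset of 5 * 32 + 12 = 172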
639
640 // if instance stride enable is:
641 // true - add the product of the instanceID and advancement state to the offset into the VB
642 // false - value of vInstanceStride has been initialized to zero
643 vOffsets = ADD(vOffsets, vInstanceStride);
644
645 // Packing and component control
646 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
647 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
648 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
649
650 // Special gather/conversion for formats without equal component sizes
651 if (IsOddFormat((SWR_FORMAT)ied.Format))
652 {
653 Value *pResults[4];
654 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
655 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
656
657 for (uint32_t c = 0; c < 4; c += 1)
658 {
659 if (isComponentEnabled(compMask, c))
660 {
661 vVertexElements[currentVertexElement++] = pResults[c];
662 if (currentVertexElement > 3)
663 {
664 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
665 // reset to the next vVertexElement to output
666 currentVertexElement = 0;
667 }
668 }
669 }
670 }
671 else if(info.type[0] == SWR_TYPE_FLOAT)
672 {
673 ///@todo: support 64 bit vb accesses
674 Value *gatherSrc = VIMMED1(0.0f);
675
676 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
677 "Unsupported format for standard gather fetch.");
678
679 // Gather components from memory to store in a simdvertex structure
680 switch (bpc)
681 {
682 case 16:
683 {
684 Value *vGatherResult[2];
685
686 // if we have at least one component out of x or y to fetch
687 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
688 {
689 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
690 // e.g. result of first 8x32bit integer gather for 16bit components
691 // 256i - 0 1 2 3 4 5 6 7
692 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
693 //
694 }
695
696 // if we have at least one component out of z or w to fetch
697 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
698 {
699 // offset base to the next components(zw) in the vertex to gather
700 pStreamBase = GEP(pStreamBase, C((char)4));
701
702 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
703 // e.g. result of second 8x32bit integer gather for 16bit components
704 // 256i - 0 1 2 3 4 5 6 7
705 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
706 //
707 }
708
709 // if we have at least one component to shuffle into place
710 if (compMask)
711 {
712 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
713 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
714
715 // Shuffle gathered components into place in simdvertex struct
716 mVWidth == 16 ? Shuffle16bpcGather16(args) : Shuffle16bpcGather(args); // outputs to vVertexElements ref
717 }
718 }
719 break;
720 case 32:
721 {
722 for (uint32_t i = 0; i < 4; i += 1)
723 {
724 if (isComponentEnabled(compMask, i))
725 {
726 // if we need to gather the component
727 if (compCtrl[i] == StoreSrc)
728 {
729 // Gather a SIMD of vertices
730 // APIs allow a 4GB range for offsets
731 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
732 // But, we know that elements must be aligned for FETCH. :)
733 // Right shift the offset by one bit and gather with a scale of 2, keeping the offset in the positive signed range.
734 Value *vShiftedOffsets = LSHR(vOffsets, 1);
735 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBaseGFX, vShiftedOffsets, vGatherMask, 2, GFX_MEM_CLIENT_FETCH);
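// e.g. a byte offset of 0xF0000000 (~3.75GB) would sign-extend as a negative index; 0xF0000000 >> 1 = 0x78000000 is positive, and the gather scale of 2 restores the original address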
736 }
737 else
738 {
739 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
740 }
741
742 if (currentVertexElement > 3)
743 {
744 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
745 // reset to the next vVertexElement to output
746 currentVertexElement = 0;
747 }
748 }
749
750 // offset base to the next component in the vertex to gather
751 pStreamBase = GEP(pStreamBase, C((char)4));
752 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
753 }
754 }
755 break;
756 case 64:
757 {
758 for (uint32_t i = 0; i < 4; i += 1)
759 {
760 if (isComponentEnabled(compMask, i))
761 {
762 // if we need to gather the component
763 if (compCtrl[i] == StoreSrc)
764 {
765 Value* vShufLo;
766 Value* vShufHi;
767 Value* vShufAll;
768
769 if (mVWidth == 8)
770 {
771 vShufLo = C({ 0, 1, 2, 3 });
772 vShufHi = C({ 4, 5, 6, 7 });
773 vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
774 }
775 else
776 {
777 SWR_ASSERT(mVWidth == 16);
778 vShufLo = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
779 vShufHi = C({ 8, 9, 10, 11, 12, 13, 14, 15 });
780 vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
781 }
782
783 Value *vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
784 Value *vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);
785
786 Value *vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
787 Value *vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);
788
789 Value *vZeroDouble = VECTOR_SPLAT(mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
790
791 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
792 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
793
794 pGatherLo = VCVTPD2PS(pGatherLo);
795 pGatherHi = VCVTPD2PS(pGatherHi);
796
797 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);
798
799 vVertexElements[currentVertexElement++] = pGather;
800 }
801 else
802 {
803 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
804 }
805
806 if (currentVertexElement > 3)
807 {
808 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
809 // reset to the next vVertexElement to output
810 currentVertexElement = 0;
811 }
812 }
813
814 // offset base to the next component in the vertex to gather
815 pStreamBase = GEP(pStreamBase, C((char)8));
816 }
817 }
818 break;
819 default:
820 SWR_INVALID("Tried to fetch invalid FP format");
821 break;
822 }
823 }
824 else
825 {
826 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
827 ConversionType conversionType = CONVERT_NONE;
828
829 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
830 "Unsupported format for standard gather fetch.");
831
832 switch(info.type[0])
833 {
834 case SWR_TYPE_UNORM:
835 conversionType = CONVERT_NORMALIZED;
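// deliberate fall-through: UNORM components are zero extended like UINT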
836 case SWR_TYPE_UINT:
837 extendCastType = Instruction::CastOps::ZExt;
838 break;
839 case SWR_TYPE_SNORM:
840 conversionType = CONVERT_NORMALIZED;
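// deliberate fall-through: SNORM components are sign extended like SINT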
841 case SWR_TYPE_SINT:
842 extendCastType = Instruction::CastOps::SExt;
843 break;
844 case SWR_TYPE_USCALED:
845 conversionType = CONVERT_USCALED;
846 extendCastType = Instruction::CastOps::UIToFP;
847 break;
848 case SWR_TYPE_SSCALED:
849 conversionType = CONVERT_SSCALED;
850 extendCastType = Instruction::CastOps::SIToFP;
851 break;
852 case SWR_TYPE_SFIXED:
853 conversionType = CONVERT_SFIXED;
854 extendCastType = Instruction::CastOps::SExt;
855 break;
856 default:
857 break;
858 }
859
860 // value substituted when component of gather is masked
861 Value* gatherSrc = VIMMED1(0);
862
863 // Gather components from memory to store in a simdvertex structure
864 switch (bpc)
865 {
866 case 8:
867 {
868 // if we have at least one component to fetch
869 if (compMask)
870 {
871 Value *vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
872 // e.g. result of an 8x32bit integer gather for 8bit components
873 // 256i - 0 1 2 3 4 5 6 7
874 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
875
876 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
877 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
878
879 // Shuffle gathered components into place in simdvertex struct
880 mVWidth == 16 ? Shuffle8bpcGatherd16(args) : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
881 }
882 }
883 break;
884 case 16:
885 {
886 Value *vGatherResult[2];
887
888 // if we have at least one component out of x or y to fetch
889 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
890 {
891 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
892 // e.g. result of first 8x32bit integer gather for 16bit components
893 // 256i - 0 1 2 3 4 5 6 7
894 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
895 //
896 }
897
898 // if we have at least one component out of z or w to fetch
899 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
900 {
901 // offset base to the next components(zw) in the vertex to gather
902 pStreamBase = GEP(pStreamBase, C((char)4));
903
904 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
905 // e.g. result of second 8x32bit integer gather for 16bit components
906 // 256i - 0 1 2 3 4 5 6 7
907 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
908 //
909 }
910
911 // if we have at least one component to shuffle into place
912 if (compMask)
913 {
914 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
915 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
916
917 // Shuffle gathered components into place in simdvertex struct
918 mVWidth == 16 ? Shuffle16bpcGather16(args) : Shuffle16bpcGather(args); // outputs to vVertexElements ref
919 }
920 }
921 break;
922 case 32:
923 {
924 // Gather components and store them in the simdvertex struct
925 for (uint32_t i = 0; i < 4; i++)
926 {
927 if (isComponentEnabled(compMask, i))
928 {
929 // if we need to gather the component
930 if (compCtrl[i] == StoreSrc)
931 {
932 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
933
934 if (conversionType == CONVERT_USCALED)
935 {
936 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
937 }
938 else if (conversionType == CONVERT_SSCALED)
939 {
940 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
941 }
942 else if (conversionType == CONVERT_SFIXED)
943 {
944 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
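// SFIXED is treated as 16.16 fixed point here; e.g. a raw value of 98304 (0x18000) * (1/65536) becomes 1.5f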
945 }
946
947 vVertexElements[currentVertexElement++] = pGather;
948
949 // e.g. result of a single 8x32bit integer gather for 32bit components
950 // 256i - 0 1 2 3 4 5 6 7
951 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
952 }
953 else
954 {
955 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
956 }
957
958 if (currentVertexElement > 3)
959 {
960 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
961
962 // reset to the next vVertexElement to output
963 currentVertexElement = 0;
964 }
965
966 }
967
968 // offset base to the next component in the vertex to gather
969 pStreamBase = GEP(pStreamBase, C((char)4));
970 }
971 }
972 break;
973 }
974 }
975 }
976
977 // if we have a partially filled vVertexElement struct, output it
978 if (currentVertexElement > 0)
979 {
980 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
981 }
982 }
983
984 typedef void*(*PFN_TRANSLATEGFXADDRESS_FUNC)(void* pdc, gfxptr_t va);
985 extern "C" void GetSimdValid8bitIndicesGfx(gfxptr_t indices, gfxptr_t lastIndex, uint32_t vWidth, PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, void* pdc, uint32_t* outIndices);
986 extern "C" void GetSimdValid16bitIndicesGfx(gfxptr_t indices, gfxptr_t lastIndex, uint32_t vWidth, PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, void* pdc, uint32_t* outIndices);
987
988 //////////////////////////////////////////////////////////////////////////
989 /// @brief Loads a simd of valid indices. OOB indices are set to 0
990 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
991 /// support
992 /// @param pIndices - pointer to 8 bit indices
993 /// @param pLastIndex - pointer to last valid index
994 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
995 {
996 SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty, "Function expects gfxptr_t for both input parameters.");
997
998 Value* vIndices = VUNDEF_I();
999
1000 {
1001 // store 0 index on stack to be used to conditionally load from if index address is OOB
1002 Value* pZeroIndex = ALLOCA(mInt8Ty);
1003 STORE(C((uint8_t)0), pZeroIndex);
1004
1005 // Load a SIMD of index pointers
1006 for (int64_t lane = 0; lane < mVWidth; lane++)
1007 {
1008 // Calculate the address of the requested index
1009 Value *pIndex = GEP(pIndices, C(lane), mInt8PtrTy);
1010
1011 pLastIndex = INT_TO_PTR(pLastIndex, mInt8PtrTy);
1012
1013 // check if the address is less than the max index,
1014 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1015
1016 // if valid, load the index. if not, load 0 from the stack
1017 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1018 Value *index = LOAD(pValid, "valid index", PointerType::get(mInt8Ty, 0), GFX_MEM_CLIENT_FETCH);
1019
1020 // zero extend index to 32 bits and insert into the correct simd lane
1021 index = Z_EXT(index, mInt32Ty);
1022 vIndices = VINSERT(vIndices, index, lane);
1023 }
1024 }
1025
1026 return vIndices;
1027 }
1028
1029 //////////////////////////////////////////////////////////////////////////
1030 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1031 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1032 /// support
1033 /// @param pIndices - pointer to 16 bit indices
1034 /// @param pLastIndex - pointer to last valid index
1035 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1036 {
1037 SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty, "Function expects gfxptr_t for both input parameters.");
1038
1039 Value* vIndices = VUNDEF_I();
1040
1041 {
1042 // store 0 index on stack to be used to conditionally load from if index address is OOB
1043 Value* pZeroIndex = ALLOCA(mInt16Ty);
1044 STORE(C((uint16_t)0), pZeroIndex);
1045
1046 // Load a SIMD of index pointers
1047 for (int64_t lane = 0; lane < mVWidth; lane++)
1048 {
1049 // Calculate the address of the requested index
1050 Value *pIndex = GEP(pIndices, C(lane), mInt16PtrTy);
1051
1052 pLastIndex = INT_TO_PTR(pLastIndex, mInt16PtrTy);
1053
1054 // check if the address is less than the max index,
1055 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1056
1057 // if valid, load the index. if not, load 0 from the stack
1058 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1059 Value *index = LOAD(pValid, "valid index", PointerType::get(mInt16Ty, 0), GFX_MEM_CLIENT_FETCH);
1060
1061 // zero extend index to 32 bits and insert into the correct simd lane
1062 index = Z_EXT(index, mInt32Ty);
1063 vIndices = VINSERT(vIndices, index, lane);
1064 }
1065 }
1066
1067 return vIndices;
1068 }
1069
1070 //////////////////////////////////////////////////////////////////////////
1071 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1072 /// @param pIndices - pointer to 32 bit indices
1073 /// @param pLastIndex - pointer to last valid index
1074 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1075 {
1076 DataLayout dL(JM()->mpCurrentModule);
1077 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1078 Value* iLastIndex = pLastIndex;
1079 Value* iIndices = pIndices;
1080
1081 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1082 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1083 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1084 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1085
1086 // create a vector of index counts from the base index ptr passed into the fetch
1087 Constant* vIndexOffsets;
1088 if (mVWidth == 8)
1089 {
1090 vIndexOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
1091 }
1092 else
1093 {
1094 vIndexOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
1095 }
1096
1097 // compare index count to the max valid index
1098 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1099 // vIndexOffsets 0 1 2 3 4 5 6 7
1100 // ------------------------------
1101 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1102 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1103 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1104 Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);
1105
1106 // Load the indices; OOB loads 0
1107 pIndices = BITCAST(pIndices, PointerType::get(mSimdInt32Ty, 0));
1108 return MASKED_LOAD(pIndices, 4, vIndexMask, VIMMED1(0));
1109 }
1110
1111 //////////////////////////////////////////////////////////////////////////
1112 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1113 /// denormalizes if needed, converts to F32 if needed, and positions in
1114 /// the proper SIMD rows to be output to the simdvertex structure
1115 /// @param args: (tuple of args, listed below)
1116 /// @param vGatherResult - 8 gathered 8bpc vertices
1117 /// @param pVtxOut - base pointer to output simdvertex struct
1118 /// @param extendType - sign extend or zero extend
1119 /// @param conversionType - type of conversion to apply to the components
1120 /// @param currentVertexElement - reference to the current vVertexElement
1121 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1122 /// @param compMask - component packing mask
1123 /// @param compCtrl - component control val
1124 /// @param vVertexElements[4] - vertex components to output
1125 /// @param swizzle[4] - component swizzle location
1126 void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args)
1127 {
1128 // Unpack tuple args
1129 Value*& vGatherResult = std::get<0>(args);
1130 Value* pVtxOut = std::get<1>(args);
1131 const Instruction::CastOps extendType = std::get<2>(args);
1132 const ConversionType conversionType = std::get<3>(args);
1133 uint32_t &currentVertexElement = std::get<4>(args);
1134 uint32_t &outputElt = std::get<5>(args);
1135 const ComponentEnable compMask = std::get<6>(args);
1136 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1137 Value* (&vVertexElements)[4] = std::get<8>(args);
1138 const uint32_t(&swizzle)[4] = std::get<9>(args);
1139
1140 // cast types
1141 Type *vGatherTy = VectorType::get(mInt32Ty, 8);
1142 Type *v32x8Ty = VectorType::get(mInt8Ty, 32);
1143
1144 // have to do extra work for sign extending
1145 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
1146 {
1147 Type *v16x8Ty = VectorType::get(mInt8Ty, 16); // 8x16bit ints in a 128bit lane
1148 Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1149
1150 // shuffle mask, including any swizzling
1151 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1152 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1153 Value *vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12),
1154 char(y), char(y + 4), char(y + 8), char(y + 12),
1155 char(z), char(z + 4), char(z + 8), char(z + 12),
1156 char(w), char(w + 4), char(w + 8), char(w + 12),
1157 char(x), char(x + 4), char(x + 8), char(x + 12),
1158 char(y), char(y + 4), char(y + 8), char(y + 12),
1159 char(z), char(z + 4), char(z + 8), char(z + 12),
1160 char(w), char(w + 4), char(w + 8), char(w + 12) });
1161
1162 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now...
1163
1164 Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1165 Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1166
1167 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1168 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1169
1170 // after pshufb: group components together in each 128bit lane
1171 // 256i - 0 1 2 3 4 5 6 7
1172 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1173
1174 Value *vi128XY_lo = nullptr;
1175 Value *vi128XY_hi = nullptr;
1176 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1177 {
1178 vi128XY_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
1179 vi128XY_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
1180
1181 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1182 // 256i - 0 1 2 3 4 5 6 7
1183 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1184 }
1185
1186 // do the same for zw components
1187 Value *vi128ZW_lo = nullptr;
1188 Value *vi128ZW_hi = nullptr;
1189 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1190 {
1191 vi128ZW_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
1192 vi128ZW_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
1193 }
1194
1195 // init denormalize variables if needed
1196 Instruction::CastOps fpCast;
1197 Value *conversionFactor;
1198
1199 switch (conversionType)
1200 {
1201 case CONVERT_NORMALIZED:
1202 fpCast = Instruction::CastOps::SIToFP;
1203 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1204 break;
1205 case CONVERT_SSCALED:
1206 fpCast = Instruction::CastOps::SIToFP;
1207 conversionFactor = VIMMED1((float)(1.0));
1208 break;
1209 case CONVERT_USCALED:
1210 SWR_INVALID("Type should not be sign extended!");
1211 conversionFactor = nullptr;
1212 break;
1213 default:
1214 SWR_ASSERT(conversionType == CONVERT_NONE);
1215 conversionFactor = nullptr;
1216 break;
1217 }
1218
1219 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1220 for (uint32_t i = 0; i < 4; i++)
1221 {
1222 if (isComponentEnabled(compMask, i))
1223 {
1224 if (compCtrl[i] == ComponentControl::StoreSrc)
1225 {
1226 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1227 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1228 // if x or y, use vi128XY permute result, else use vi128ZW
1229 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1230 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1231
1232 // sign extend
1233 Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
1234 Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
1235
1236 Value* temp = JOIN_16(temp_lo, temp_hi);
1237
1238 // denormalize if needed
1239 if (conversionType != CONVERT_NONE)
1240 {
1241 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1242 }
1243
1244 vVertexElements[currentVertexElement] = temp;
1245
1246 currentVertexElement += 1;
1247 }
1248 else
1249 {
1250 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1251 }
1252
1253 if (currentVertexElement > 3)
1254 {
1255 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1256 // reset to the next vVertexElement to output
1257 currentVertexElement = 0;
1258 }
1259 }
1260 }
1261 }
1262 // else zero extend
1263 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1264 {
1265 // init denormalize variables if needed
1266 Instruction::CastOps fpCast;
1267 Value *conversionFactor;
1268
1269 switch (conversionType)
1270 {
1271 case CONVERT_NORMALIZED:
1272 fpCast = Instruction::CastOps::UIToFP;
1273 conversionFactor = VIMMED1((float)(1.0 / 255.0));
1274 break;
1275 case CONVERT_USCALED:
1276 fpCast = Instruction::CastOps::UIToFP;
1277 conversionFactor = VIMMED1((float)(1.0));
1278 break;
1279 case CONVERT_SSCALED:
1280 SWR_INVALID("Type should not be zero extended!");
1281 conversionFactor = nullptr;
1282 break;
1283 default:
1284 SWR_ASSERT(conversionType == CONVERT_NONE);
1285 conversionFactor = nullptr;
1286 break;
1287 }
1288
1289 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1290 for (uint32_t i = 0; i < 4; i++)
1291 {
1292 if (isComponentEnabled(compMask, i))
1293 {
1294 if (compCtrl[i] == ComponentControl::StoreSrc)
1295 {
1296 // pshufb masks for each component
1297 Value *vConstMask;
1298 switch (swizzle[i])
1299 {
1300 case 0:
1301 // x shuffle mask
1302 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1303 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
1304 break;
1305 case 1:
1306 // y shuffle mask
1307 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1308 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
1309 break;
1310 case 2:
1311 // z shuffle mask
1312 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1313 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
1314 break;
1315 case 3:
1316 // w shuffle mask
1317 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1318 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
1319 break;
1320 default:
1321 vConstMask = nullptr;
1322 break;
1323 }
1324
1325 Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1326 Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1327
1328 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1329 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1330
1331 // after pshufb for x channel
1332 // 256i - 0 1 2 3 4 5 6 7
1333 // x000 x000 x000 x000 x000 x000 x000 x000
1334
1335 Value* temp = JOIN_16(temp_lo, temp_hi);
1336
1337 // denormalize if needed
1338 if (conversionType != CONVERT_NONE)
1339 {
1340 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1341 }
1342
1343 vVertexElements[currentVertexElement] = temp;
1344
1345 currentVertexElement += 1;
1346 }
1347 else
1348 {
1349 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1350 }
1351
1352 if (currentVertexElement > 3)
1353 {
1354 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1355 // reset to the next vVertexElement to output
1356 currentVertexElement = 0;
1357 }
1358 }
1359 }
1360 }
1361 else
1362 {
1363 SWR_INVALID("Unsupported conversion type");
1364 }
1365 }
1366
1367 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1368 {
1369 // Unpack tuple args
1370 Value*& vGatherResult = std::get<0>(args);
1371 Value* pVtxOut = std::get<1>(args);
1372 const Instruction::CastOps extendType = std::get<2>(args);
1373 const ConversionType conversionType = std::get<3>(args);
1374 uint32_t &currentVertexElement = std::get<4>(args);
1375 uint32_t &outputElt = std::get<5>(args);
1376 const ComponentEnable compMask = std::get<6>(args);
1377 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1378 Value* (&vVertexElements)[4] = std::get<8>(args);
1379 const uint32_t(&swizzle)[4] = std::get<9>(args);
1380
1381 // cast types
1382 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1383
1384 for (uint32_t i = 0; i < 4; i++)
1385 {
1386 if (!isComponentEnabled(compMask, i))
1387 continue;
1388
1389 if (compCtrl[i] == ComponentControl::StoreSrc)
1390 {
1391 std::vector<uint32_t> vShuffleMasks[4] = {
1392 { 0, 4, 8, 12, 16, 20, 24, 28 }, // x
1393 { 1, 5, 9, 13, 17, 21, 25, 29 }, // y
1394 { 2, 6, 10, 14, 18, 22, 26, 30 }, // z
1395 { 3, 7, 11, 15, 19, 23, 27, 31 }, // w
1396 };
1397
1398 Value *val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
1399 UndefValue::get(v32x8Ty),
1400 vShuffleMasks[swizzle[i]]);
1401
1402 if ((extendType == Instruction::CastOps::SExt) ||
1403 (extendType == Instruction::CastOps::SIToFP)) {
1404 switch (conversionType)
1405 {
1406 case CONVERT_NORMALIZED:
1407 val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
1408 break;
1409 case CONVERT_SSCALED:
1410 val = SI_TO_FP(val, mSimdFP32Ty);
1411 break;
1412 case CONVERT_USCALED:
1413 SWR_INVALID("Type should not be sign extended!");
1414 break;
1415 default:
1416 SWR_ASSERT(conversionType == CONVERT_NONE);
1417 val = S_EXT(val, mSimdInt32Ty);
1418 break;
1419 }
1420 }
1421 else if ((extendType == Instruction::CastOps::ZExt) ||
1422 (extendType == Instruction::CastOps::UIToFP)) {
1423 switch (conversionType)
1424 {
1425 case CONVERT_NORMALIZED:
1426 val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
1427 break;
1428 case CONVERT_SSCALED:
1429 SWR_INVALID("Type should not be zero extended!");
1430 break;
1431 case CONVERT_USCALED:
1432 val = UI_TO_FP(val, mSimdFP32Ty);
1433 break;
1434 default:
1435 SWR_ASSERT(conversionType == CONVERT_NONE);
1436 val = Z_EXT(val, mSimdInt32Ty);
1437 break;
1438 }
1439 }
1440 else
1441 {
1442 SWR_INVALID("Unsupported conversion type");
1443 }
1444
1445 vVertexElements[currentVertexElement++] = val;
1446 }
1447 else
1448 {
1449 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1450 }
1451
1452 if (currentVertexElement > 3)
1453 {
1454 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1455 // reset to the next vVertexElement to output
1456 currentVertexElement = 0;
1457 }
1458 }
1459 }
1460
1461 //////////////////////////////////////////////////////////////////////////
1462 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1463 /// denormalizes if needed, converts to F32 if needed, and positions in
1464 /// the proper SIMD rows to be output to the simdvertex structure
1465 /// @param args: (tuple of args, listed below)
1466 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1467 /// @param pVtxOut - base pointer to output simdvertex struct
1468 /// @param extendType - sign extend or zero extend
1469 /// @param conversionType - type of conversion to apply to the components
1470 /// @param currentVertexElement - reference to the current vVertexElement
1471 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1472 /// @param compMask - component packing mask
1473 /// @param compCtrl - component control val
1474 /// @param vVertexElements[4] - vertex components to output
1475 void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
1476 {
1477 // Unpack tuple args
1478 Value* (&vGatherResult)[2] = std::get<0>(args);
1479 Value* pVtxOut = std::get<1>(args);
1480 const Instruction::CastOps extendType = std::get<2>(args);
1481 const ConversionType conversionType = std::get<3>(args);
1482 uint32_t &currentVertexElement = std::get<4>(args);
1483 uint32_t &outputElt = std::get<5>(args);
1484 const ComponentEnable compMask = std::get<6>(args);
1485 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1486 Value* (&vVertexElements)[4] = std::get<8>(args);
1487
1488 // cast types
1489 Type *vGatherTy = VectorType::get(mInt32Ty, 8);
1490 Type *v32x8Ty = VectorType::get(mInt8Ty, 32);
1491
1492 // have to do extra work for sign extending
1493 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
1494 {
1495 // is this a half-precision (16-bit) float?
1496 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1497
1498 Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1499 Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1500
1501 // shuffle mask
1502 Value *vConstMask = C<uint8_t>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1503 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
1504 Value *vi128XY_lo = nullptr;
1505 Value *vi128XY_hi = nullptr;
1506 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1507 {
1508 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now...
1509
1510 Value *vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty);
1511 Value *vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty);
1512
1513 Value *vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1514 Value *vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1515
1516 // after pshufb: group components together in each 128bit lane
1517 // 256i - 0 1 2 3 4 5 6 7
1518 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1519
1520 vi128XY_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1521 vi128XY_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1522
1523 // after PERMD: move and pack xy components into each 128bit lane
1524 // 256i - 0 1 2 3 4 5 6 7
1525 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1526 }
1527
1528 // do the same for zw components
1529 Value *vi128ZW_lo = nullptr;
1530 Value *vi128ZW_hi = nullptr;
1531 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1532 {
1533 Value *vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty);
1534 Value *vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty);
1535
1536 Value *vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1537 Value *vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1538
1539 vi128ZW_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1540 vi128ZW_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1541 }
1542
1543 // init denormalize variables if needed
1544 Instruction::CastOps IntToFpCast;
1545 Value *conversionFactor;
1546
1547 switch (conversionType)
1548 {
1549 case CONVERT_NORMALIZED:
1550 IntToFpCast = Instruction::CastOps::SIToFP;
1551 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1552 break;
1553 case CONVERT_SSCALED:
1554 IntToFpCast = Instruction::CastOps::SIToFP;
1555 conversionFactor = VIMMED1((float)(1.0));
1556 break;
1557 case CONVERT_USCALED:
1558 SWR_INVALID("Type should not be sign extended!");
1559 conversionFactor = nullptr;
1560 break;
1561 default:
1562 SWR_ASSERT(conversionType == CONVERT_NONE);
1563 conversionFactor = nullptr;
1564 break;
1565 }
1566
1567 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1568 for (uint32_t i = 0; i < 4; i++)
1569 {
1570 if (isComponentEnabled(compMask, i))
1571 {
1572 if (compCtrl[i] == ComponentControl::StoreSrc)
1573 {
1574 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1575 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1576 // if x or y, use vi128XY permute result, else use vi128ZW
1577 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1578 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1579
1580 if (bFP)
1581 {
1582 // extract 128 bit lanes and convert each half-float component to full float
1583 Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1584 Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1585
1586 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
1587 }
1588 else
1589 {
1590 // extract 128 bit lanes to sign extend each component
1591 Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1592 Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1593
1594 Value* temp = JOIN_16(temp_lo, temp_hi);
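// JOIN_16 recombines the two SIMD8 halves into a single SIMD16 result.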
1595
1596 // denormalize if needed
1597 if (conversionType != CONVERT_NONE)
1598 {
1599 temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor);
1600 }
1601
1602 vVertexElements[currentVertexElement] = temp;
1603 }
1604
1605 currentVertexElement += 1;
1606 }
1607 else
1608 {
1609 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1610 }
1611
1612 if (currentVertexElement > 3)
1613 {
1614 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1615 // reset to begin filling the next simdvertex output
1616 currentVertexElement = 0;
1617 }
1618 }
1619 }
1620 }
1621 // else zero extend
1622 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1623 {
1624 // pshufb masks for each component
1625 Value *vConstMask[2];
1626
1627 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
1628 {
1629 // x/z shuffle mask
1630 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1631 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1632 }
1633
1634 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
1635 {
1636 // y/w shuffle mask
1637 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1638 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
1639 }
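// PSHUFB zeroes any destination byte whose mask byte has its high bit set, so the -1
// entries above clear the upper word of each 32-bit lane; moving the selected 16-bit
// component into the low word therefore zero extends it to 32 bits.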
1640
1641 // init denormalize variables if needed
1642 Instruction::CastOps fpCast;
1643 Value* conversionFactor;
1644
1645 switch (conversionType)
1646 {
1647 case CONVERT_NORMALIZED:
1648 fpCast = Instruction::CastOps::UIToFP;
1649 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1650 break;
1651 case CONVERT_USCALED:
1652 fpCast = Instruction::CastOps::UIToFP;
1653 conversionFactor = VIMMED1((float)(1.0f));
1654 break;
1655 case CONVERT_SSCALED:
1656 SWR_INVALID("Type should not be zero extended!");
1657 conversionFactor = nullptr;
1658 break;
1659 default:
1660 SWR_ASSERT(conversionType == CONVERT_NONE);
1661 conversionFactor = nullptr;
1662 break;
1663 }
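// For CONVERT_NORMALIZED the UIToFP result is later multiplied by 1/65535, mapping the
// unsigned 16-bit range to [0.0, 1.0].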
1664
1665 // shuffle enabled components into the lower word of each 32-bit lane, zero-extending to 32 bits
1666 for (uint32_t i = 0; i < 4; i++)
1667 {
1668 if (isComponentEnabled(compMask, i))
1669 {
1670 if (compCtrl[i] == ComponentControl::StoreSrc)
1671 {
1672 // select correct constMask for x/z or y/w pshufb
1673 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1674 // if x or y, use the first gather result (xy), else use the second (zw)
1675 uint32_t selectedGather = (i < 2) ? 0 : 1;
1676
1677 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 halves for the sake of KNL, for now.
1678
1679 Value *vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
1680 Value *vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
1681
1682 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1683 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1684
1685 // after applying the pshufb mask for the x channel; z uses the same shuffle on the second gather result
1686 // 256i - 0 1 2 3 4 5 6 7
1687 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1688
1689 Value* temp = JOIN_16(temp_lo, temp_hi);
1690
1691 // denormalize if needed
1692 if (conversionType != CONVERT_NONE)
1693 {
1694 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1695 }
1696
1697 vVertexElements[currentVertexElement] = temp;
1698
1699 currentVertexElement += 1;
1700 }
1701 else
1702 {
1703 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1704 }
1705
1706 if (currentVertexElement > 3)
1707 {
1708 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1709 // reset to begin filling the next simdvertex output
1710 currentVertexElement = 0;
1711 }
1712 }
1713 }
1714 }
1715 else
1716 {
1717 SWR_INVALID("Unsupported conversion type");
1718 }
1719 }
1720
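//////////////////////////////////////////////////////////////////////////
/// @brief Takes two SIMDs of gathered 16bpc vertex data, sign or zero extends
/// (or converts from half float), denormalizes if needed, and shuffles the
/// enabled components into vVertexElements. SIMD8 variant of the routine above.
/// @param args - Shuffle16bpcArgs tuple; see the typedef in FetchJit.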
1721 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
1722 {
1723 // Unpack tuple args
1724 Value* (&vGatherResult)[2] = std::get<0>(args);
1725 Value* pVtxOut = std::get<1>(args);
1726 const Instruction::CastOps extendType = std::get<2>(args);
1727 const ConversionType conversionType = std::get<3>(args);
1728 uint32_t &currentVertexElement = std::get<4>(args);
1729 uint32_t &outputElt = std::get<5>(args);
1730 const ComponentEnable compMask = std::get<6>(args);
1731 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1732 Value* (&vVertexElements)[4] = std::get<8>(args);
1733
1734 // cast types
1735 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1736 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1737
1738 // have to do extra work for sign extending
1739 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) ||
1740 (extendType == Instruction::CastOps::FPExt))
1741 {
1742 // is this a 16-bit (partial-precision) float that needs FPExt to full float?
1743 bool bFP = (extendType == Instruction::CastOps::FPExt);
1744
1745 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1746 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1747
1748 // shuffle mask
1749 Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1750 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
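// Same per-128-bit-lane grouping as the SIMD16 path above: low (x) words first, then
// high (y) words.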
1751 Value* vi128XY = nullptr;
1752 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) {
1753 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1754 // after pshufb: group components together in each 128bit lane
1755 // 256i - 0 1 2 3 4 5 6 7
1756 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1757
1758 vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1759 // after PERMD: move and pack xy components into each 128bit lane
1760 // 256i - 0 1 2 3 4 5 6 7
1761 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1762 }
1763
1764 // do the same for zw components
1765 Value* vi128ZW = nullptr;
1766 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) {
1767 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1768 vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1769 }
1770
1771 // init denormalize variables if needed
1772 Instruction::CastOps IntToFpCast;
1773 Value* conversionFactor;
1774
1775 switch (conversionType)
1776 {
1777 case CONVERT_NORMALIZED:
1778 IntToFpCast = Instruction::CastOps::SIToFP;
1779 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1780 break;
1781 case CONVERT_SSCALED:
1782 IntToFpCast = Instruction::CastOps::SIToFP;
1783 conversionFactor = VIMMED1((float)(1.0));
1784 break;
1785 case CONVERT_USCALED:
1786 SWR_INVALID("Type should not be sign extended!");
1787 conversionFactor = nullptr;
1788 break;
1789 default:
1790 SWR_ASSERT(conversionType == CONVERT_NONE);
1791 conversionFactor = nullptr;
1792 break;
1793 }
1794
1795 // sign extend all enabled components; once vVertexElements is full, output it to the current simdvertex
1796 for (uint32_t i = 0; i < 4; i++)
1797 {
1798 if (isComponentEnabled(compMask, i))
1799 {
1800 if (compCtrl[i] == ComponentControl::StoreSrc)
1801 {
1802 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1803 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1804 // if x or y, use vi128XY permute result, else use vi128ZW
1805 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1806
1807 if (bFP) {
1808 // extract 128 bit lanes and convert each half-float component to full float
1809 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1810 }
1811 else {
1812 // extract 128 bit lanes to sign extend each component
1813 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1814
1815 // denormalize if needed
1816 if (conversionType != CONVERT_NONE) {
1817 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1818 }
1819 }
1820 currentVertexElement++;
1821 }
1822 else
1823 {
1824 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1825 }
1826
1827 if (currentVertexElement > 3)
1828 {
1829 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1830 // reset to begin filling the next simdvertex output
1831 currentVertexElement = 0;
1832 }
1833 }
1834 }
1835 }
1836 // else zero extend
1837 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1838 {
1839 // pshufb masks for each component
1840 Value* vConstMask[2];
1841 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) {
1842 // x/z shuffle mask
1843 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1844 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1845 }
1846
1847 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) {
1848 // y/w shuffle mask
1849 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1850 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
1851 }
1852
1853 // init denormalize variables if needed
1854 Instruction::CastOps fpCast;
1855 Value* conversionFactor;
1856
1857 switch (conversionType)
1858 {
1859 case CONVERT_NORMALIZED:
1860 fpCast = Instruction::CastOps::UIToFP;
1861 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1862 break;
1863 case CONVERT_USCALED:
1864 fpCast = Instruction::CastOps::UIToFP;
1865 conversionFactor = VIMMED1((float)(1.0f));
1866 break;
1867 case CONVERT_SSCALED:
1868 SWR_INVALID("Type should not be zero extended!");
1869 conversionFactor = nullptr;
1870 break;
1871 default:
1872 SWR_ASSERT(conversionType == CONVERT_NONE);
1873 conversionFactor = nullptr;
1874 break;
1875 }
1876
1877 // shuffle enabled components into the lower word of each 32-bit lane, zero-extending to 32 bits
1878 for (uint32_t i = 0; i < 4; i++)
1879 {
1880 if (isComponentEnabled(compMask, i))
1881 {
1882 if (compCtrl[i] == ComponentControl::StoreSrc)
1883 {
1884 // select correct constMask for x/z or y/w pshufb
1885 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1886 // if x or y, use the first gather result (xy), else use the second (zw)
1887 uint32_t selectedGather = (i < 2) ? 0 : 1;
1888
1889 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1890 // after applying the pshufb mask for the x channel; z uses the same shuffle on the second gather result
1891 // 256i - 0 1 2 3 4 5 6 7
1892 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1893
1894 // denormalize if needed
1895 if (conversionType != CONVERT_NONE)
1896 {
1897 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1898 }
1899 currentVertexElement++;
1900 }
1901 else
1902 {
1903 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1904 }
1905
1906 if (currentVertexElement > 3)
1907 {
1908 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1909 // reset to begin filling the next simdvertex output
1910 currentVertexElement = 0;
1911 }
1912 }
1913 }
1914 }
1915 else
1916 {
1917 SWR_INVALID("Unsupported conversion type");
1918 }
1919 }
1920
1921 //////////////////////////////////////////////////////////////////////////
1922 /// @brief Output a simdvertex worth of elements to the current outputElt
1923 /// @param pVtxOut - base address of VIN output struct
1924 /// @param outputElt - simdvertex offset in VIN to write to
1925 /// @param numEltsToStore - number of simdvertex rows to write out
1926 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
1927 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
1928 {
1929 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
1930
1931 for (uint32_t c = 0; c < numEltsToStore; ++c)
1932 {
1933 // STORE expects FP32 x vWidth type, just bitcast if needed
1934 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
1935 {
1936 #if FETCH_DUMP_VERTEX
1937 PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
1938 #endif
1939 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
1940 }
1941 #if FETCH_DUMP_VERTEX
1942 else
1943 {
1944 PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
1945 }
1946 #endif
1947 // outputElt * 4 = offsetting by the size of a simdvertex
1948 // + c offsets to a 32bit x vWidth row within the current vertex
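// e.g. with vWidth == 8, each simdvertex occupies 4 consecutive rows of 8 floats,
// so element c of output slot outputElt lands at row (outputElt * 4 + c) of pVtxOut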
1949 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), nullptr, "destGEP");
1950 STORE(vVertexElements[c], dest);
1951 }
1952 }
1953
1954 //////////////////////////////////////////////////////////////////////////
1955 /// @brief Generates a constant vector of values based on the
1956 /// ComponentControl value
1957 /// @param ctrl - ComponentControl value
1958 Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
1959 {
1960 switch (ctrl)
1961 {
1962 case NoStore:
1963 return VUNDEF_I();
1964 case Store0:
1965 return VIMMED1(0);
1966 case Store1Fp:
1967 return VIMMED1(1.0f);
1968 case Store1Int:
1969 return VIMMED1(1);
1970 case StoreVertexId:
1971 {
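// For SIMD16 the vertex IDs arrive as two SIMD8 halves (VertexID and VertexID2),
// which are bitcast to float vectors and joined into one SIMD16 register.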
1972 if (mVWidth == 16)
1973 {
1974 Type* pSimd8FPTy = VectorType::get(mFP32Ty, 8);
1975 Value *pIdLo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), pSimd8FPTy);
1976 Value *pIdHi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), pSimd8FPTy);
1977 return JOIN_16(pIdLo, pIdHi);
1978 }
1979 else
1980 {
1981 return BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
1982 }
1983 }
1984 case StoreInstanceId:
1985 {
1986 Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
1987 return VBROADCAST(pId);
1988 }
1989
1990
1991 case StoreSrc:
1992 default:
1993 SWR_INVALID("Invalid component control");
1994 return VUNDEF_I();
1995 }
1996 }
1997
1998 //////////////////////////////////////////////////////////////////////////
1999 /// @brief Returns the enable mask for the specified component.
2000 /// @param enableMask - enable bits
2001 /// @param component - component to check if enabled.
2002 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
2003 {
2004 switch (component)
2005 {
2006 // X
2007 case 0: return (enableMask & ComponentEnable::X);
2008 // Y
2009 case 1: return (enableMask & ComponentEnable::Y);
2010 // Z
2011 case 2: return (enableMask & ComponentEnable::Z);
2012 // W
2013 case 3: return (enableMask & ComponentEnable::W);
2014
2015 default: return false;
2016 }
2017 }
2018
2019 // We don't want two threads compiling the same fetch shader simultaneously;
2020 // concurrent compilation causes problems in the JIT cache implementation.
2021 // This is currently only an issue for fetch.
2022 static std::mutex gFetchCodegenMutex;
2023
2024 //////////////////////////////////////////////////////////////////////////
2025 /// @brief JITs from fetch shader IR
2026 /// @param hJitMgr - JitManager handle
2027 /// @param func - LLVM function IR
2028 /// @return PFN_FETCH_FUNC - pointer to fetch code
2029 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
2030 {
2031 const llvm::Function* func = (const llvm::Function*)hFunc;
2032 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2033 PFN_FETCH_FUNC pfnFetch;
2034
2035 gFetchCodegenMutex.lock();
2036 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
2037 // MCJIT finalizes modules the first time you JIT code from them. Once finalized, you cannot add new IR to the module.
2038 pJitMgr->mIsModuleFinalized = true;
2039
2040 #if defined(KNOB_SWRC_TRACING)
2041 char fName[1024];
2042 const char *funcName = func->getName().data();
2043 snprintf(fName, sizeof(fName), "%s.bin", funcName);
2044 FILE *fd = fopen(fName, "wb");
2045 fwrite((void *)pfnFetch, 1, 2048, fd);
2046 fclose(fd);
2047 #endif
2048
2049 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
2050 gFetchCodegenMutex.unlock();
2051
2052
2053
2054 return pfnFetch;
2055 }
2056
2057 //////////////////////////////////////////////////////////////////////////
2058 /// @brief JIT compiles fetch shader
2059 /// @param hJitMgr - JitManager handle
2060 /// @param state - fetch state to build function from
2061 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
2062 {
2063 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2064
2065 pJitMgr->SetupNewModule();
2066
2067 FetchJit theJit(pJitMgr);
2068 HANDLE hFunc = theJit.Create(state);
2069
2070 return JitFetchFunc(hJitMgr, hFunc);
2071 }
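//////////////////////////////////////////////////////////////////////////
// Illustrative usage (hypothetical caller, not part of this file): a driver builds a
// FETCH_COMPILE_STATE describing the vertex layout, compiles it once, and caches the
// returned function pointer, e.g.:
//
//     FETCH_COMPILE_STATE state;     // assumed to be populated by the driver
//     // ... fill in vertex layout, index type, instancing info ...
//     PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, state);
//     // pfnFetch can now be invoked per draw to fetch a simd-width of vertices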