swr/rast: Enable generalized fetch jit
[mesa.git] / src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder_gfx_mem.h"
32 #include "jit_api.h"
33 #include "fetch_jit.h"
34 #include "gen_state_llvm.h"
35 #include "functionpasses/passes.h"
36
37 //#define FETCH_DUMP_VERTEX 1
38 using namespace llvm;
39 using namespace SwrJit;
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45 CONVERT_NONE,
46 CONVERT_NORMALIZED,
47 CONVERT_USCALED,
48 CONVERT_SSCALED,
49 CONVERT_SFIXED,
50 };
51
52 //////////////////////////////////////////////////////////////////////////
53 /// Interface to Jitting a fetch shader
54 //////////////////////////////////////////////////////////////////////////
55 struct FetchJit : public BuilderGfxMem
56 {
57 FetchJit(JitManager* pJitMgr) :
58 BuilderGfxMem(pJitMgr)
59 {}
60
61 Function* Create(const FETCH_COMPILE_STATE& fetchState);
62
63 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
64 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
65 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
66
67 // package up Shuffle*bpcGatherd args into a tuple for convenience
68 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
69 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
70 const uint32_t(&)[4]> Shuffle8bpcArgs;
71
72 void Shuffle8bpcGatherd16(Shuffle8bpcArgs &args);
73 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
74
75 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
76 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
77
78 void Shuffle16bpcGather16(Shuffle16bpcArgs &args);
79 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
80
81 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
82
83 Value *GenerateCompCtrlVector(const ComponentControl ctrl);
84
85 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
86
87 bool IsOddFormat(SWR_FORMAT format);
88 bool IsUniformFormat(SWR_FORMAT format);
89 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
90 void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
91 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
92
93 Value* mpFetchInfo;
94 };
95
96 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
97 {
98 std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
99 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
100
101 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
102 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
103
104 fetch->getParent()->setModuleIdentifier(fetch->getName());
105
106 IRB()->SetInsertPoint(entry);
107
108 auto argitr = fetch->arg_begin();
109
110 // Fetch shader arguments
111 Value* privateContext = &*argitr; ++argitr;
112 privateContext->setName("privateContext");
113 SetPrivateContext(privateContext);
114
115 mpFetchInfo = &*argitr; ++argitr;
116 mpFetchInfo->setName("fetchInfo");
117 Value* pVtxOut = &*argitr;
118 pVtxOut->setName("vtxOutput");
119
120 uint32_t baseWidth = mVWidth;
121
122 SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth);
123
124 // Override builder target width to force 16-wide SIMD
125 #if USE_SIMD16_SHADERS
126 SetTargetWidth(16);
127 #endif
128
129 pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0));
130
131 // SWR_FETCH_CONTEXT::pStreams
132 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
133 streams->setName("pStreams");
134
135 // SWR_FETCH_CONTEXT::pIndices
136 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
137 indices->setName("pIndices");
138
139 // SWR_FETCH_CONTEXT::pLastIndex
140 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
141 pLastIndex->setName("pLastIndex");
142
143 Value* vIndices;
144 switch(fetchState.indexType)
145 {
146 case R8_UINT:
147 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
148 if(fetchState.bDisableIndexOOBCheck)
149 {
150 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
151 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
152 }
153 else
154 {
155 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
156 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
157 }
158 break;
159 case R16_UINT:
160 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
161 if(fetchState.bDisableIndexOOBCheck)
162 {
163 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
164 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
165 }
166 else
167 {
168 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
169 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
170 }
171 break;
172 case R32_UINT:
173         vIndices = fetchState.bDisableIndexOOBCheck ? LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty, 0)), {(uint32_t)0})
174                                                      : GetSimdValid32bitIndices(indices, pLastIndex);
175 break; // incoming type is already 32bit int
176 default:
177 SWR_INVALID("Unsupported index type");
178 vIndices = nullptr;
179 break;
180 }
181
182 if(fetchState.bForceSequentialAccessEnable)
183 {
184 Value* pOffsets = mVWidth == 8 ? C({ 0, 1, 2, 3, 4, 5, 6, 7 }) :
185 C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
186
187         // VertexData buffers are accessed sequentially; the index is equal to the vertex number
188 vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
189 vIndices = ADD(vIndices, pOffsets);
190 }
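    // Illustrative note: with StartVertex = 100 and an 8-wide SIMD, the sequential path above
    // yields vIndices = { 100, 101, ..., 107 } -- each lane simply fetches the next vertex in
    // the buffer and the index stream is ignored.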
191
192 Value* vVertexId = vIndices;
193 if (fetchState.bVertexIDOffsetEnable)
194 {
195 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
196 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
197 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
198 vVertexId = ADD(vIndices, vBaseVertex);
199 vVertexId = ADD(vVertexId, vStartVertex);
200 }
201
202 // store out vertex IDs
203 if (mVWidth == 16)
204 {
205 // store out in simd8 halves until core supports 16-wide natively
206 auto vVertexIdLo = EXTRACT_16(vVertexId, 0);
207 auto vVertexIdHi = EXTRACT_16(vVertexId, 1);
208 STORE(vVertexIdLo, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
209 STORE(vVertexIdHi, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
210 }
211 else if (mVWidth == 8)
212 {
213 STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
214 }
215
216 // store out cut mask if enabled
217 if (fetchState.bEnableCutIndex)
218 {
219 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
220 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
221
222 if (mVWidth == 16)
223 {
224 auto cutMaskLo = EXTRACT_16(cutMask, 0);
225 auto cutMaskHi = EXTRACT_16(cutMask, 1);
226 STORE(cutMaskLo, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
227 STORE(cutMaskHi, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
228 }
229 else if (mVWidth == 8)
230 {
231 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
232 }
233 }
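    // The cut index is the primitive-restart index: any lane whose fetched index equals
    // fetchState.cutIndex has its bit set in the stored cut mask so downstream stages can
    // restart the primitive. e.g. with cutIndex = 0xFFFF and indices { 0, 1, 0xFFFF, 2, ... },
    // lane 2 is flagged.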
234
235 // Fetch attributes from memory and output to a simdvertex struct
236 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
237 JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
238
239 RET_VOID();
240
241 JitManager::DumpToFile(fetch, "src");
242
243 #if defined(_DEBUG)
244 verifyFunction(*fetch);
245 #endif
246
247 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
248
249 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
250 setupPasses.add(createBreakCriticalEdgesPass());
251 setupPasses.add(createCFGSimplificationPass());
252 setupPasses.add(createEarlyCSEPass());
253 setupPasses.add(createPromoteMemoryToRegisterPass());
254
255 setupPasses.run(*fetch);
256
257 JitManager::DumpToFile(fetch, "se");
258
259 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
260
261 ///@todo Haven't touched these either. Need to remove some of these and add others.
262 optPasses.add(createCFGSimplificationPass());
263 optPasses.add(createEarlyCSEPass());
264 optPasses.add(createInstructionCombiningPass());
265 optPasses.add(createInstructionSimplifierPass());
266 optPasses.add(createConstantPropagationPass());
267 optPasses.add(createSCCPPass());
268 optPasses.add(createAggressiveDCEPass());
269
270 optPasses.run(*fetch);
271
272 optPasses.add(createLowerX86Pass(JM(), this));
273 optPasses.run(*fetch);
274
275 JitManager::DumpToFile(fetch, "opt");
276
277
278 // Revert 16-wide override
279 #if USE_SIMD16_SHADERS
280 SetTargetWidth(baseWidth);
281 #endif
282
283 return fetch;
284 }
285
286 // returns true for odd formats that require special gather handling
287 bool FetchJit::IsOddFormat(SWR_FORMAT format)
288 {
289 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
290 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
291 {
292 return true;
293 }
294 return false;
295 }
296
297 // format is uniform if all components are the same size and type
298 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
299 {
300 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
301 uint32_t bpc0 = info.bpc[0];
302 uint32_t type0 = info.type[0];
303
304 for (uint32_t c = 1; c < info.numComps; ++c)
305 {
306 if (bpc0 != info.bpc[c] || type0 != info.type[c])
307 {
308 return false;
309 }
310 }
311 return true;
312 }
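// e.g. R10G10B10A2_UNORM (bpc = 10/10/10/2) is an "odd" format and takes the special gather
// path below, while R32G32B32A32_FLOAT (four identical 32-bit float components) is "uniform"
// and can use the standard per-component gather path.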
313
314 // unpacks components based on format
315 // foreach component in the pixel
316 // mask off everything but this component
317 // shift component to LSB
318 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
319 {
320 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
321
322 uint32_t bitOffset = 0;
323 for (uint32_t c = 0; c < info.numComps; ++c)
324 {
325 uint32_t swizzledIndex = info.swizzle[c];
326 uint32_t compBits = info.bpc[c];
327 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
328 Value* comp = AND(vInput, bitmask);
329 comp = LSHR(comp, bitOffset);
330
331 result[swizzledIndex] = comp;
332 bitOffset += compBits;
333 }
334 }
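// Illustrative worked example for a 10/10/10/2 packed format such as R10G10B10A2_UNORM:
//   c=0: bitmask 0x000003FF, shift  0
//   c=1: bitmask 0x000FFC00, shift 10
//   c=2: bitmask 0x3FF00000, shift 20
//   c=3: bitmask 0xC0000000, shift 30
// The swizzle table then decides which result[] slot each unpacked component lands in.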
335
336 // gather for odd component size formats
337 // gather SIMD full pixels per lane, then shift/mask to move each component into its
338 // own vector
339 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
340 {
341 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
342
343 // only works if pixel size is <= 32bits
344 SWR_ASSERT(info.bpp <= 32);
345
346 Value *pGather;
347 if (info.bpp == 32)
348 {
349 pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
350 }
351 else
352 {
353         // Can't use a 32-bit gather for items smaller than 32 bits, since it could cause page faults.
354 Value *pMem = ALLOCA(mSimdInt32Ty);
355 STORE(VIMMED1(0u), pMem);
356
357 pBase = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
358 Value* pDstMem = BITCAST(pMem, mInt32PtrTy);
359
360 for (uint32_t lane = 0; lane < mVWidth; ++lane)
361 {
362 // Get index
363 Value* index = VEXTRACT(pOffsets, C(lane));
364 Value* mask = VEXTRACT(pMask, C(lane));
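            // Per-lane scalar equivalent of the masked loads below: read from the real source
            // address only when the lane is active, otherwise re-read the zero we just stored,
            // so inactive lanes never touch (possibly unmapped) memory:
            //
            //     value = *(mask ? pSrc : pDst);
            //     *pDst = value;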
365 switch (info.bpp)
366 {
367 case 8:
368 {
369 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
370 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt8Ty, 0));
371 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
372 break;
373 }
374
375 case 16:
376 {
377 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
378 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
379 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
380 break;
381 }
383
384 case 24:
385 {
386 // First 16-bits of data
387 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
388 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
389 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
390
391 // Last 8-bits of data
392 pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
393 pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0));
394 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
395 break;
396 }
397
398 default:
399 SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
400 break;
401 }
402 }
403
404 pGather = LOAD(pMem);
405 }
406
407 for (uint32_t comp = 0; comp < 4; ++comp)
408 {
409 pResult[comp] = VIMMED1((int)info.defaults[comp]);
410 }
411
412 UnpackComponents(format, pGather, pResult);
413
414 // cast to fp32
415 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
416 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
417 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
418 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
419 }
420
421 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
422 {
423 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
424
425 for (uint32_t c = 0; c < info.numComps; ++c)
426 {
427 uint32_t compIndex = info.swizzle[c];
428
429 // skip any conversion on UNUSED components
430 if (info.type[c] == SWR_TYPE_UNUSED)
431 {
432 continue;
433 }
434
435 if (info.isNormalized[c])
436 {
437 if (info.type[c] == SWR_TYPE_SNORM)
438 {
439 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
440
441                 /// result = c * (1.0f / (2^(n-1) - 1))
442 uint32_t n = info.bpc[c];
443 uint32_t pow2 = 1 << (n - 1);
444 float scale = 1.0f / (float)(pow2 - 1);
445 Value *vScale = VIMMED1(scale);
446 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
447 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
448 texels[compIndex] = FMUL(texels[compIndex], vScale);
449 }
450 else
451 {
452 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
453
454 /// result = c * (1.0f / (2^n - 1))
455 uint32_t n = info.bpc[c];
456 uint32_t pow2 = 1 << n;
457 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
458 if (n == 24)
459 {
460 float scale = (float)(pow2 - 1);
461 Value* vScale = VIMMED1(scale);
462 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
463 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
464 texels[compIndex] = FDIV(texels[compIndex], vScale);
465 }
466 else
467 {
468 float scale = 1.0f / (float)(pow2 - 1);
469 Value *vScale = VIMMED1(scale);
470 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
471 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
472 texels[compIndex] = FMUL(texels[compIndex], vScale);
473 }
474 }
475 continue;
476 }
477 }
478 }
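// Illustrative scalar sketch (not emitted by the jitter) of the normalized conversions above,
// for a single n-bit component value c:
//
//     float snorm_to_float(int32_t c, uint32_t n)  { return (float)c * (1.0f / (float)((1u << (n - 1)) - 1u)); }
//     float unorm_to_float(uint32_t c, uint32_t n) { return (float)c * (1.0f / (float)((1u << n) - 1u)); }
//
// e.g. an 8-bit UNORM value of 255 maps to 255 * (1.0f / 255.0f) = 1.0f.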
479
480 //////////////////////////////////////////////////////////////////////////
481 /// @brief Loads attributes from memory using AVX2 GATHER(s)
482 /// @param fetchState - info about attributes to be fetched from memory
483 /// @param streams - value pointer to the current vertex stream
484 /// @param vIndices - vector value of indices to gather
485 /// @param pVtxOut - value pointer to output simdvertex struct
486 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
487 Value* streams, Value* vIndices, Value* pVtxOut)
488 {
489 uint32_t currentVertexElement = 0;
490 uint32_t outputElt = 0;
491 Value* vVertexElements[4];
492
493 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
494 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
495 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
496 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
497 curInstance->setName("curInstance");
498
499 for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
500 {
501 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
502
503 // skip element if all components are disabled
504 if (ied.ComponentPacking == ComponentEnable::NONE)
505 {
506 continue;
507 }
508
509 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
510 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
511 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
512
513 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
514
515 // VGATHER* takes an *i8 src pointer
516 Value *pStreamBase = INT_TO_PTR(stream, PointerType::get(mInt8Ty, 0));
517
518 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
519 Value *vStride = VBROADCAST(stride);
520
521 // max vertex index that is fully in bounds
522 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
523 maxVertex = LOAD(maxVertex);
524
525 Value *minVertex = NULL;
526 if (fetchState.bPartialVertexBuffer)
527 {
528 // min vertex index for low bounds OOB checking
529 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
530 minVertex = LOAD(minVertex);
531 }
532
533 if (fetchState.bInstanceIDOffsetEnable)
534 {
535 // the InstanceID (curInstance) value is offset by StartInstanceLocation
536 curInstance = ADD(curInstance, startInstance);
537 }
538
539 Value *vCurIndices;
540 Value *startOffset;
541 Value *vInstanceStride = VIMMED1(0);
542
543 if (ied.InstanceEnable)
544 {
545 Value* stepRate = C(ied.InstanceAdvancementState);
546
547 // prevent a div by 0 for 0 step rate
548 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
549 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
550
551 // calc the current offset into instanced data buffer
552 Value* calcInstance = UDIV(curInstance, stepRate);
553
554 // if step rate is 0, every instance gets instance 0
555 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
556
557 vCurIndices = VBROADCAST(calcInstance);
558 startOffset = startInstance;
559 }
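        // e.g. with an advancement (step) rate of 4, instances 0..7 read instanced-data
        // records 0, 0, 0, 0, 1, 1, 1, 1; a step rate of 0 pins every instance to record 0
        // via the SELECTs above.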
560 else if (ied.InstanceStrideEnable)
561 {
562 // grab the instance advancement state, determines stride in bytes from one instance to the next
563 Value* stepRate = C(ied.InstanceAdvancementState);
564 vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
565
566 // offset indices by baseVertex
567 vCurIndices = ADD(vIndices, vBaseVertex);
568
569 startOffset = startVertex;
570 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
571 }
572 else
573 {
574 // offset indices by baseVertex
575 vCurIndices = ADD(vIndices, vBaseVertex);
576 startOffset = startVertex;
577 }
578
579 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
580 // do 64bit address offset calculations.
581
582 // calculate byte offset to the start of the VB
583 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
584 pStreamBase = GEP(pStreamBase, baseOffset);
585 Value* pStreamBaseGFX = ADD(stream, baseOffset);
586
587 // if we have a start offset, subtract from max vertex. Used for OOB check
588 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
589 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
590 // if we have a negative value, we're already OOB. clamp at 0.
591 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
592
593 if (fetchState.bPartialVertexBuffer)
594 {
595             // similarly for min vertex
596 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
597 Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
598 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
599 }
600
601 // Load the in bounds size of a partially valid vertex
602 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
603 partialInboundsSize = LOAD(partialInboundsSize);
604 Value *vPartialVertexSize = VBROADCAST(partialInboundsSize);
605 Value *vBpp = VBROADCAST(C(info.Bpp));
606 Value *vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
607
608         // is the element <= the partially valid size?
609 Value *vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
610
611 // override cur indices with 0 if pitch is 0
612 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
613 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
614
615 // are vertices partially OOB?
616 Value* vMaxVertex = VBROADCAST(maxVertex);
617 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
618
619 // are vertices fully in bounds?
620 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
621
622 Value *vGatherMask;
623 if (fetchState.bPartialVertexBuffer)
624 {
625 // are vertices below minVertex limit?
626 Value *vMinVertex = VBROADCAST(minVertex);
627 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
628
629 // only fetch lanes that pass both tests
630 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
631 }
632 else
633 {
634 vGatherMask = vMaxGatherMask;
635 }
636
637 // blend in any partially OOB indices that have valid elements
638 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
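        // Per-lane summary of the final gather predicate:
        //   index == maxVertex -> fetch only if the element fits within the partially valid tail vertex
        //   otherwise          -> fetch if index < maxVertex (and index >= minVertex for partial VBs)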
639
640 // calculate the actual offsets into the VB
641 Value* vOffsets = MUL(vCurIndices, vStride);
642 vOffsets = ADD(vOffsets, vAlignmentOffsets);
643
644 // if instance stride enable is:
645         //     true  - add the product of the instanceID and advancement state to the offset into the VB
646         //     false - vInstanceStride has been initialized to zero
647 vOffsets = ADD(vOffsets, vInstanceStride);
648
649 // Packing and component control
650 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
651 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
652 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
653
654 // Special gather/conversion for formats without equal component sizes
655 if (IsOddFormat((SWR_FORMAT)ied.Format))
656 {
657 Value *pResults[4];
658 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
659 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
660
661 for (uint32_t c = 0; c < 4; c += 1)
662 {
663 if (isComponentEnabled(compMask, c))
664 {
665 vVertexElements[currentVertexElement++] = pResults[c];
666 if (currentVertexElement > 3)
667 {
668 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
669 // reset to the next vVertexElement to output
670 currentVertexElement = 0;
671 }
672 }
673 }
674 }
675 else if(info.type[0] == SWR_TYPE_FLOAT)
676 {
677 ///@todo: support 64 bit vb accesses
678 Value *gatherSrc = VIMMED1(0.0f);
679
680 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
681 "Unsupported format for standard gather fetch.");
682
683 // Gather components from memory to store in a simdvertex structure
684 switch (bpc)
685 {
686 case 16:
687 {
688 Value *vGatherResult[2];
689
690 // if we have at least one component out of x or y to fetch
691 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
692 {
693 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
694 // e.g. result of first 8x32bit integer gather for 16bit components
695 // 256i - 0 1 2 3 4 5 6 7
696 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
697 //
698 }
699
700 // if we have at least one component out of z or w to fetch
701 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
702 {
703 // offset base to the next components(zw) in the vertex to gather
704 pStreamBase = GEP(pStreamBase, C((char)4));
705
706 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
707 // e.g. result of second 8x32bit integer gather for 16bit components
708 // 256i - 0 1 2 3 4 5 6 7
709 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
710 //
711 }
712
713 // if we have at least one component to shuffle into place
714 if (compMask)
715 {
716 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
717 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
718
719 // Shuffle gathered components into place in simdvertex struct
720 mVWidth == 16 ? Shuffle16bpcGather16(args) : Shuffle16bpcGather(args); // outputs to vVertexElements ref
721 }
722 }
723 break;
724 case 32:
725 {
726 for (uint32_t i = 0; i < 4; i += 1)
727 {
728 if (isComponentEnabled(compMask, i))
729 {
730 // if we need to gather the component
731 if (compCtrl[i] == StoreSrc)
732 {
733 // Gather a SIMD of vertices
734 // APIs allow a 4GB range for offsets
735 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
736 // But, we know that elements must be aligned for FETCH. :)
737 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
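                        // e.g. a byte offset of 0x80000004 (> 2GB) would be treated as negative by the
                        // signed gather; LSHR by 1 gives 0x40000002, and the gather scale of 2 rebuilds
                        // the full offset. This relies on offsets being even (4-byte aligned elements).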
738 Value *vShiftedOffsets = LSHR(vOffsets, 1);
739 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBaseGFX, vShiftedOffsets, vGatherMask, 2, GFX_MEM_CLIENT_FETCH);
740 }
741 else
742 {
743 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
744 }
745
746 if (currentVertexElement > 3)
747 {
748 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
749 // reset to the next vVertexElement to output
750 currentVertexElement = 0;
751 }
752 }
753
754 // offset base to the next component in the vertex to gather
755 pStreamBase = GEP(pStreamBase, C((char)4));
756 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
757 }
758 }
759 break;
760 case 64:
761 {
762 for (uint32_t i = 0; i < 4; i += 1)
763 {
764 if (isComponentEnabled(compMask, i))
765 {
766 // if we need to gather the component
767 if (compCtrl[i] == StoreSrc)
768 {
769 Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
770 Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
771
772 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
773 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
774
775 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
776
777 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
778 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
779
780 pGatherLo = VCVTPD2PS(pGatherLo);
781 pGatherHi = VCVTPD2PS(pGatherHi);
782
783 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
784
785 vVertexElements[currentVertexElement++] = pGather;
786 }
787 else
788 {
789 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
790 }
791
792 if (currentVertexElement > 3)
793 {
794 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
795 // reset to the next vVertexElement to output
796 currentVertexElement = 0;
797 }
798 }
799
800 // offset base to the next component in the vertex to gather
801 pStreamBase = GEP(pStreamBase, C((char)8));
802 }
803 }
804 break;
805 default:
806 SWR_INVALID("Tried to fetch invalid FP format");
807 break;
808 }
809 }
810 else
811 {
812 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
813 ConversionType conversionType = CONVERT_NONE;
814
815 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
816 "Unsupported format for standard gather fetch.");
817
818 switch(info.type[0])
819 {
820 case SWR_TYPE_UNORM:
821 conversionType = CONVERT_NORMALIZED;
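            // fallthrough: UNORM components are also zero extended (ZExt) before normalization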
822 case SWR_TYPE_UINT:
823 extendCastType = Instruction::CastOps::ZExt;
824 break;
825 case SWR_TYPE_SNORM:
826 conversionType = CONVERT_NORMALIZED;
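            // fallthrough: SNORM components are also sign extended (SExt) before normalization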
827 case SWR_TYPE_SINT:
828 extendCastType = Instruction::CastOps::SExt;
829 break;
830 case SWR_TYPE_USCALED:
831 conversionType = CONVERT_USCALED;
832 extendCastType = Instruction::CastOps::UIToFP;
833 break;
834 case SWR_TYPE_SSCALED:
835 conversionType = CONVERT_SSCALED;
836 extendCastType = Instruction::CastOps::SIToFP;
837 break;
838 case SWR_TYPE_SFIXED:
839 conversionType = CONVERT_SFIXED;
840 extendCastType = Instruction::CastOps::SExt;
841 break;
842 default:
843 break;
844 }
845
846 // value substituted when component of gather is masked
847 Value* gatherSrc = VIMMED1(0);
848
849 // Gather components from memory to store in a simdvertex structure
850 switch (bpc)
851 {
852 case 8:
853 {
854 // if we have at least one component to fetch
855 if (compMask)
856 {
857 Value *vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
858 // e.g. result of an 8x32bit integer gather for 8bit components
859 // 256i - 0 1 2 3 4 5 6 7
860 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
861
862 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
863 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
864
865 // Shuffle gathered components into place in simdvertex struct
866 mVWidth == 16 ? Shuffle8bpcGatherd16(args) : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
867 }
868 }
869 break;
870 case 16:
871 {
872 Value *vGatherResult[2];
873
874 // if we have at least one component out of x or y to fetch
875 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
876 {
877 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
878 // e.g. result of first 8x32bit integer gather for 16bit components
879 // 256i - 0 1 2 3 4 5 6 7
880 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
881 //
882 }
883
884 // if we have at least one component out of z or w to fetch
885 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
886 {
887 // offset base to the next components(zw) in the vertex to gather
888 pStreamBase = GEP(pStreamBase, C((char)4));
889
890 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
891 // e.g. result of second 8x32bit integer gather for 16bit components
892 // 256i - 0 1 2 3 4 5 6 7
893 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
894 //
895 }
896
897 // if we have at least one component to shuffle into place
898 if (compMask)
899 {
900 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
901 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
902
903 // Shuffle gathered components into place in simdvertex struct
904 mVWidth == 16 ? Shuffle16bpcGather16(args) : Shuffle16bpcGather(args); // outputs to vVertexElements ref
905 }
906 }
907 break;
908 case 32:
909 {
910                 // Gather components into place in simdvertex struct
911 for (uint32_t i = 0; i < 4; i++)
912 {
913 if (isComponentEnabled(compMask, i))
914 {
915 // if we need to gather the component
916 if (compCtrl[i] == StoreSrc)
917 {
918 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
919
920 if (conversionType == CONVERT_USCALED)
921 {
922 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
923 }
924 else if (conversionType == CONVERT_SSCALED)
925 {
926 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
927 }
928 else if (conversionType == CONVERT_SFIXED)
929 {
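                            // SFIXED is treated as 16.16 fixed point here; multiplying by 1/65536
                            // recovers the float value, e.g. raw 0x00018000 (98304) -> 1.5f.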
930 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
931 }
932
933 vVertexElements[currentVertexElement++] = pGather;
934
935 // e.g. result of a single 8x32bit integer gather for 32bit components
936 // 256i - 0 1 2 3 4 5 6 7
937 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
938 }
939 else
940 {
941 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
942 }
943
944 if (currentVertexElement > 3)
945 {
946 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
947
948 // reset to the next vVertexElement to output
949 currentVertexElement = 0;
950 }
951
952 }
953
954 // offset base to the next component in the vertex to gather
955 pStreamBase = GEP(pStreamBase, C((char)4));
956 }
957 }
958 break;
959 }
960 }
961 }
962
963 // if we have a partially filled vVertexElement struct, output it
964 if (currentVertexElement > 0)
965 {
966 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
967 }
968 }
969
970 //////////////////////////////////////////////////////////////////////////
971 /// @brief Loads a simd of valid indices. OOB indices are set to 0
972 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
973 /// support
974 /// @param pIndices - pointer to 8 bit indices
975 /// @param pLastIndex - pointer to last valid index
976 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
977 {
978     // can fit 4 8 bit integers per vWidth lane
979 Value* vIndices = VUNDEF_I();
980
981 // store 0 index on stack to be used to conditionally load from if index address is OOB
982 Value* pZeroIndex = ALLOCA(mInt8Ty);
983 STORE(C((uint8_t)0), pZeroIndex);
984
985 // Load a SIMD of index pointers
986 for(int64_t lane = 0; lane < mVWidth; lane++)
987 {
988 // Calculate the address of the requested index
989 Value *pIndex = GEP(pIndices, C(lane));
990
991 // check if the address is less than the max index,
992 Value* mask = ICMP_ULT(pIndex, pLastIndex);
993
994 // if valid, load the index. if not, load 0 from the stack
995 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
996 Value *index = LOAD(pValid, "valid index");
997
998         // zero extend index to 32 bits and insert into the correct simd lane
999 index = Z_EXT(index, mInt32Ty);
1000 vIndices = VINSERT(vIndices, index, lane);
1001 }
1002 return vIndices;
1003 }
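// Per-lane scalar sketch of the OOB-safe index load above:
//
//     uint8_t idx = ((pIndices + lane) < pLastIndex) ? pIndices[lane] : 0;
//     vIndices[lane] = (uint32_t)idx;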
1004
1005 //////////////////////////////////////////////////////////////////////////
1006 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1007 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1008 /// support
1009 /// @param pIndices - pointer to 16 bit indices
1010 /// @param pLastIndex - pointer to last valid index
1011 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1012 {
1013 // can fit 2 16 bit integers per vWidth lane
1014 Value* vIndices = VUNDEF_I();
1015
1016 // store 0 index on stack to be used to conditionally load from if index address is OOB
1017 Value* pZeroIndex = ALLOCA(mInt16Ty);
1018 STORE(C((uint16_t)0), pZeroIndex);
1019
1020 // Load a SIMD of index pointers
1021 for(int64_t lane = 0; lane < mVWidth; lane++)
1022 {
1023 // Calculate the address of the requested index
1024 Value *pIndex = GEP(pIndices, C(lane));
1025
1026 // check if the address is less than the max index,
1027 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1028
1029 // if valid, load the index. if not, load 0 from the stack
1030 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1031 Value *index = LOAD(pValid, "valid index", GFX_MEM_CLIENT_FETCH);
1032
1033         // zero extend index to 32 bits and insert into the correct simd lane
1034 index = Z_EXT(index, mInt32Ty);
1035 vIndices = VINSERT(vIndices, index, lane);
1036 }
1037 return vIndices;
1038 }
1039
1040 //////////////////////////////////////////////////////////////////////////
1041 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1042 /// @param pIndices - pointer to 32 bit indices
1043 /// @param pLastIndex - pointer to last valid index
1044 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1045 {
1046 DataLayout dL(JM()->mpCurrentModule);
1047 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1048 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1049 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1050
1051 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1052 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1053 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1054 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1055
1056 // create a vector of index counts from the base index ptr passed into the fetch
1057 Constant* vIndexOffsets;
1058 if (mVWidth == 8)
1059 {
1060 vIndexOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
1061 }
1062 else
1063 {
1064 vIndexOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
1065 }
1066
1067 // compare index count to the max valid index
1068 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1069 // vIndexOffsets 0 1 2 3 4 5 6 7
1070 // ------------------------------
1071 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1072 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1073 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1074 Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);
1075
1076 // Load the indices; OOB loads 0
1077 pIndices = BITCAST(pIndices, PointerType::get(mSimdInt32Ty, 0));
1078 return MASKED_LOAD(pIndices, 4, vIndexMask, VIMMED1(0));
1079 }
1080
1081 //////////////////////////////////////////////////////////////////////////
1082 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1083 /// denormalizes if needed, converts to F32 if needed, and positions in
1084 /// the proper SIMD rows to be output to the simdvertex structure
1085 /// @param args: (tuple of args, listed below)
1086 /// @param vGatherResult - 8 gathered 8bpc vertices
1087 /// @param pVtxOut - base pointer to output simdvertex struct
1088 /// @param extendType - sign extend or zero extend
1089 /// @param conversionType - type of conversion to apply (normalized, scaled, etc.)
1090 /// @param currentVertexElement - reference to the current vVertexElement
1091 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1092 /// @param compMask - component packing mask
1093 /// @param compCtrl - component control val
1094 /// @param vVertexElements[4] - vertex components to output
1095 /// @param swizzle[4] - component swizzle location
1096 void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args)
1097 {
1098 // Unpack tuple args
1099 Value*& vGatherResult = std::get<0>(args);
1100 Value* pVtxOut = std::get<1>(args);
1101 const Instruction::CastOps extendType = std::get<2>(args);
1102 const ConversionType conversionType = std::get<3>(args);
1103 uint32_t &currentVertexElement = std::get<4>(args);
1104 uint32_t &outputElt = std::get<5>(args);
1105 const ComponentEnable compMask = std::get<6>(args);
1106 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1107 Value* (&vVertexElements)[4] = std::get<8>(args);
1108 const uint32_t(&swizzle)[4] = std::get<9>(args);
1109
1110 // cast types
1111 Type *vGatherTy = VectorType::get(mInt32Ty, 8);
1112 Type *v32x8Ty = VectorType::get(mInt8Ty, 32);
1113
1114 // have to do extra work for sign extending
1115 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
1116 {
1117         Type *v16x8Ty = VectorType::get(mInt8Ty, 16); // 16x8bit ints in a 128bit lane
1118 Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1119
1120 // shuffle mask, including any swizzling
1121 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1122 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1123 Value *vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12),
1124 char(y), char(y + 4), char(y + 8), char(y + 12),
1125 char(z), char(z + 4), char(z + 8), char(z + 12),
1126 char(w), char(w + 4), char(w + 8), char(w + 12),
1127 char(x), char(x + 4), char(x + 8), char(x + 12),
1128 char(y), char(y + 4), char(y + 8), char(y + 12),
1129 char(z), char(z + 4), char(z + 8), char(z + 12),
1130 char(w), char(w + 4), char(w + 8), char(w + 12) });
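        // e.g. with the identity swizzle {0,1,2,3} this mask is
        //   { 0,4,8,12,  1,5,9,13,  2,6,10,14,  3,7,11,15,  ...repeated for the upper 128-bit lane }
        // which pulls byte 0 of every gathered dword together into x, byte 1 into y, and so on.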
1131
1132         // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
1133
1134 Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1135 Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1136
1137 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1138 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1139
1140 // after pshufb: group components together in each 128bit lane
1141 // 256i - 0 1 2 3 4 5 6 7
1142 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1143
1144 Value *vi128XY_lo = nullptr;
1145 Value *vi128XY_hi = nullptr;
1146 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1147 {
1148 vi128XY_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
1149 vi128XY_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
1150
1151 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1152 // 256i - 0 1 2 3 4 5 6 7
1153 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1154 }
1155
1156 // do the same for zw components
1157 Value *vi128ZW_lo = nullptr;
1158 Value *vi128ZW_hi = nullptr;
1159 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1160 {
1161 vi128ZW_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
1162 vi128ZW_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
1163 }
1164
1165 // init denormalize variables if needed
1166 Instruction::CastOps fpCast;
1167 Value *conversionFactor;
1168
1169 switch (conversionType)
1170 {
1171 case CONVERT_NORMALIZED:
1172 fpCast = Instruction::CastOps::SIToFP;
1173 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1174 break;
1175 case CONVERT_SSCALED:
1176 fpCast = Instruction::CastOps::SIToFP;
1177 conversionFactor = VIMMED1((float)(1.0));
1178 break;
1179 case CONVERT_USCALED:
1180 SWR_INVALID("Type should not be sign extended!");
1181 conversionFactor = nullptr;
1182 break;
1183 default:
1184 SWR_ASSERT(conversionType == CONVERT_NONE);
1185 conversionFactor = nullptr;
1186 break;
1187 }
1188
1189         // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1190 for (uint32_t i = 0; i < 4; i++)
1191 {
1192 if (isComponentEnabled(compMask, i))
1193 {
1194 if (compCtrl[i] == ComponentControl::StoreSrc)
1195 {
1196 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1197 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1198 // if x or y, use vi128XY permute result, else use vi128ZW
1199 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1200 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1201
1202 // sign extend
1203 Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
1204 Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
1205
1206 Value* temp = JOIN_16(temp_lo, temp_hi);
1207
1208 // denormalize if needed
1209 if (conversionType != CONVERT_NONE)
1210 {
1211 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1212 }
1213
1214 vVertexElements[currentVertexElement] = temp;
1215
1216 currentVertexElement += 1;
1217 }
1218 else
1219 {
1220 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1221 }
1222
1223 if (currentVertexElement > 3)
1224 {
1225 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1226 // reset to the next vVertexElement to output
1227 currentVertexElement = 0;
1228 }
1229 }
1230 }
1231 }
1232 // else zero extend
1233 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1234 {
1235 // init denormalize variables if needed
1236 Instruction::CastOps fpCast;
1237 Value *conversionFactor;
1238
1239 switch (conversionType)
1240 {
1241 case CONVERT_NORMALIZED:
1242 fpCast = Instruction::CastOps::UIToFP;
1243 conversionFactor = VIMMED1((float)(1.0 / 255.0));
1244 break;
1245 case CONVERT_USCALED:
1246 fpCast = Instruction::CastOps::UIToFP;
1247 conversionFactor = VIMMED1((float)(1.0));
1248 break;
1249 case CONVERT_SSCALED:
1250 SWR_INVALID("Type should not be zero extended!");
1251 conversionFactor = nullptr;
1252 break;
1253 default:
1254 SWR_ASSERT(conversionType == CONVERT_NONE);
1255 conversionFactor = nullptr;
1256 break;
1257 }
1258
1259 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1260 for (uint32_t i = 0; i < 4; i++)
1261 {
1262 if (isComponentEnabled(compMask, i))
1263 {
1264 if (compCtrl[i] == ComponentControl::StoreSrc)
1265 {
1266 // pshufb masks for each component
1267 Value *vConstMask;
1268 switch (swizzle[i])
1269 {
1270 case 0:
1271 // x shuffle mask
1272 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1273 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
1274 break;
1275 case 1:
1276 // y shuffle mask
1277 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1278 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
1279 break;
1280 case 2:
1281 // z shuffle mask
1282 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1283 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
1284 break;
1285 case 3:
1286 // w shuffle mask
1287 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1288 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
1289 break;
1290 default:
1291 vConstMask = nullptr;
1292 break;
1293 }
1294
1295 Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1296 Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1297
1298 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1299 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1300
1301 // after pshufb for x channel
1302 // 256i - 0 1 2 3 4 5 6 7
1303 // x000 x000 x000 x000 x000 x000 x000 x000
1304
1305 Value* temp = JOIN_16(temp_lo, temp_hi);
1306
1307 // denormalize if needed
1308 if (conversionType != CONVERT_NONE)
1309 {
1310 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1311 }
1312
1313 vVertexElements[currentVertexElement] = temp;
1314
1315 currentVertexElement += 1;
1316 }
1317 else
1318 {
1319 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1320 }
1321
1322 if (currentVertexElement > 3)
1323 {
1324 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1325 // reset to the next vVertexElement to output
1326 currentVertexElement = 0;
1327 }
1328 }
1329 }
1330 }
1331 else
1332 {
1333 SWR_INVALID("Unsupported conversion type");
1334 }
1335 }
1336
1337 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1338 {
1339 // Unpack tuple args
1340 Value*& vGatherResult = std::get<0>(args);
1341 Value* pVtxOut = std::get<1>(args);
1342 const Instruction::CastOps extendType = std::get<2>(args);
1343 const ConversionType conversionType = std::get<3>(args);
1344 uint32_t &currentVertexElement = std::get<4>(args);
1345 uint32_t &outputElt = std::get<5>(args);
1346 const ComponentEnable compMask = std::get<6>(args);
1347 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1348 Value* (&vVertexElements)[4] = std::get<8>(args);
1349 const uint32_t(&swizzle)[4] = std::get<9>(args);
1350
1351 // cast types
1352 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1353
1354 for (uint32_t i = 0; i < 4; i++)
1355 {
1356 if (!isComponentEnabled(compMask, i))
1357 continue;
1358
1359 if (compCtrl[i] == ComponentControl::StoreSrc)
1360 {
1361 std::vector<uint32_t> vShuffleMasks[4] = {
1362 { 0, 4, 8, 12, 16, 20, 24, 28 }, // x
1363 { 1, 5, 9, 13, 17, 21, 25, 29 }, // y
1364 { 2, 6, 10, 14, 18, 22, 26, 30 }, // z
1365 { 3, 7, 11, 15, 19, 23, 27, 31 }, // w
1366 };
1367
1368 Value *val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
1369 UndefValue::get(v32x8Ty),
1370 vShuffleMasks[swizzle[i]]);
1371
1372 if ((extendType == Instruction::CastOps::SExt) ||
1373 (extendType == Instruction::CastOps::SIToFP)) {
1374 switch (conversionType)
1375 {
1376 case CONVERT_NORMALIZED:
1377 val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
1378 break;
1379 case CONVERT_SSCALED:
1380 val = SI_TO_FP(val, mSimdFP32Ty);
1381 break;
1382 case CONVERT_USCALED:
1383 SWR_INVALID("Type should not be sign extended!");
1384 break;
1385 default:
1386 SWR_ASSERT(conversionType == CONVERT_NONE);
1387 val = S_EXT(val, mSimdInt32Ty);
1388 break;
1389 }
1390 }
1391 else if ((extendType == Instruction::CastOps::ZExt) ||
1392 (extendType == Instruction::CastOps::UIToFP)) {
1393 switch (conversionType)
1394 {
1395 case CONVERT_NORMALIZED:
1396 val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
1397 break;
1398 case CONVERT_SSCALED:
1399 SWR_INVALID("Type should not be zero extended!");
1400 break;
1401 case CONVERT_USCALED:
1402 val = UI_TO_FP(val, mSimdFP32Ty);
1403 break;
1404 default:
1405 SWR_ASSERT(conversionType == CONVERT_NONE);
1406 val = Z_EXT(val, mSimdInt32Ty);
1407 break;
1408 }
1409 }
1410 else
1411 {
1412 SWR_INVALID("Unsupported conversion type");
1413 }
1414
1415 vVertexElements[currentVertexElement++] = val;
1416 }
1417 else
1418 {
1419 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1420 }
1421
1422 if (currentVertexElement > 3)
1423 {
1424 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1425 // reset to the next vVertexElement to output
1426 currentVertexElement = 0;
1427 }
1428 }
1429 }
1430
1431 //////////////////////////////////////////////////////////////////////////
1432 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1433 /// denormalizes if needed, converts to F32 if needed, and positions in
1434 /// the proper SIMD rows to be output to the simdvertex structure
1435 /// @param args: (tuple of args, listed below)
1436 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1437 /// @param pVtxOut - base pointer to output simdvertex struct
1438 /// @param extendType - sign extend or zero extend
1439 /// @param conversionType - type of conversion to apply (normalized, scaled, etc.)
1440 /// @param currentVertexElement - reference to the current vVertexElement
1441 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1442 /// @param compMask - component packing mask
1443 /// @param compCtrl - component control val
1444 /// @param vVertexElements[4] - vertex components to output
1445 void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
1446 {
1447 // Unpack tuple args
1448 Value* (&vGatherResult)[2] = std::get<0>(args);
1449 Value* pVtxOut = std::get<1>(args);
1450 const Instruction::CastOps extendType = std::get<2>(args);
1451 const ConversionType conversionType = std::get<3>(args);
1452 uint32_t &currentVertexElement = std::get<4>(args);
1453 uint32_t &outputElt = std::get<5>(args);
1454 const ComponentEnable compMask = std::get<6>(args);
1455 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1456 Value* (&vVertexElements)[4] = std::get<8>(args);
1457
1458 // cast types
1459 Type *vGatherTy = VectorType::get(mInt32Ty, 8);
1460 Type *v32x8Ty = VectorType::get(mInt8Ty, 32);
1461
1462 // have to do extra work for sign extending
1463 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
1464 {
1465         // is this a partial-precision (16-bit) float?
1466 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1467
1468 Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1469 Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1470
1471 // shuffle mask
1472 Value *vConstMask = C<uint8_t>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1473 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
1474 Value *vi128XY_lo = nullptr;
1475 Value *vi128XY_hi = nullptr;
1476 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1477 {
1478             // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
1479
1480 Value *vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty);
1481 Value *vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty);
1482
1483 Value *vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1484 Value *vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1485
1486 // after pshufb: group components together in each 128bit lane
1487 // 256i - 0 1 2 3 4 5 6 7
1488 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1489
1490 vi128XY_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1491 vi128XY_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1492
1493 // after PERMD: move and pack xy components into each 128bit lane
1494 // 256i - 0 1 2 3 4 5 6 7
1495 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1496 }
1497
1498 // do the same for zw components
1499 Value *vi128ZW_lo = nullptr;
1500 Value *vi128ZW_hi = nullptr;
1501 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1502 {
1503 Value *vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty);
1504 Value *vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty);
1505
1506 Value *vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1507 Value *vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1508
1509 vi128ZW_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1510 vi128ZW_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1511 }
1512
1513 // init denormalize variables if needed
1514 Instruction::CastOps IntToFpCast;
1515 Value *conversionFactor;
1516
1517 switch (conversionType)
1518 {
1519 case CONVERT_NORMALIZED:
1520 IntToFpCast = Instruction::CastOps::SIToFP;
1521 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1522 break;
1523 case CONVERT_SSCALED:
1524 IntToFpCast = Instruction::CastOps::SIToFP;
1525 conversionFactor = VIMMED1((float)(1.0));
1526 break;
1527 case CONVERT_USCALED:
1528 SWR_INVALID("Type should not be sign extended!");
1529 conversionFactor = nullptr;
1530 break;
1531 default:
1532 SWR_ASSERT(conversionType == CONVERT_NONE);
1533 conversionFactor = nullptr;
1534 break;
1535 }
1536
1537         // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1538 for (uint32_t i = 0; i < 4; i++)
1539 {
1540 if (isComponentEnabled(compMask, i))
1541 {
1542 if (compCtrl[i] == ComponentControl::StoreSrc)
1543 {
1544 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1545 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1546 // if x or y, use vi128XY permute result, else use vi128ZW
1547 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1548 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1549
1550 if (bFP)
1551 {
1552 // extract 128 bit lanes to sign extend each component
1553 /// @todo Force 8-wide cvt until we support generic cvt in x86 lowering pass
1554 Function* pCvtPh2Ps = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_vcvtph2ps_256);
1555 Value *temp_lo = CALL(pCvtPh2Ps, BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1556 Value *temp_hi = CALL(pCvtPh2Ps, BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1557
1558 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
1559 }
1560 else
1561 {
1562 // extract 128 bit lanes to sign extend each component
1563 Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1564 Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1565
1566 Value* temp = JOIN_16(temp_lo, temp_hi);
1567
1568 // denormalize if needed
1569 if (conversionType != CONVERT_NONE)
1570 {
1571 temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor);
1572 }
1573
1574 vVertexElements[currentVertexElement] = temp;
1575 }
1576
1577 currentVertexElement += 1;
1578 }
1579 else
1580 {
1581 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1582 }
1583
1584 if (currentVertexElement > 3)
1585 {
1586 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1587 // reset to the next vVertexElement to output
1588 currentVertexElement = 0;
1589 }
1590 }
1591 }
1592 }
1593 // else zero extend
1594 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1595 {
1596 // pshufb masks for each component
1597 Value *vConstMask[2];
1598
1599 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
1600 {
1601 // x/z shuffle mask
1602 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1603 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1604 }
1605
1606 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
1607 {
1608 // y/w shuffle mask
1609 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1610 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
1611 }
1612
1613 // init denormalize variables if needed
1614 Instruction::CastOps fpCast;
1615 Value* conversionFactor;
1616
1617 switch (conversionType)
1618 {
1619 case CONVERT_NORMALIZED:
1620 fpCast = Instruction::CastOps::UIToFP;
1621 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1622 break;
1623 case CONVERT_USCALED:
1624 fpCast = Instruction::CastOps::UIToFP;
1625             conversionFactor = VIMMED1(1.0f);
1626 break;
1627 case CONVERT_SSCALED:
1628 SWR_INVALID("Type should not be zero extended!");
1629 conversionFactor = nullptr;
1630 break;
1631 default:
1632 SWR_ASSERT(conversionType == CONVERT_NONE);
1633 conversionFactor = nullptr;
1634 break;
1635 }
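            // Editorial note (hedged): for zero-extended 16-bit data the factor below
            // applies the UNORM16 mapping, roughly
            //
            //     float f = (float)u * (1.0f / 65535.0f);   // u in [0, 65535]
            //
            // so 65535 maps to 1.0f; USCALED only converts to float with a factor of 1.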
1636
1637             // shuffle enabled components into the lower word of each 32bit lane, zero-extending to 32 bits
1638 for (uint32_t i = 0; i < 4; i++)
1639 {
1640 if (isComponentEnabled(compMask, i))
1641 {
1642 if (compCtrl[i] == ComponentControl::StoreSrc)
1643 {
1644 // select correct constMask for x/z or y/w pshufb
1645 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1646 // if x or y, use vi128XY permute result, else use vi128ZW
1647 uint32_t selectedGather = (i < 2) ? 0 : 1;
1648
1649                     // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
1650
1651 Value *vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
1652 Value *vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
1653
1654 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1655 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1656
1657 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1658 // 256i - 0 1 2 3 4 5 6 7
1659 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1660
1661 Value* temp = JOIN_16(temp_lo, temp_hi);
1662
1663 // denormalize if needed
1664 if (conversionType != CONVERT_NONE)
1665 {
1666 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1667 }
1668
1669 vVertexElements[currentVertexElement] = temp;
1670
1671 currentVertexElement += 1;
1672 }
1673 else
1674 {
1675 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1676 }
1677
1678 if (currentVertexElement > 3)
1679 {
1680 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1681 // reset to the next vVertexElement to output
1682 currentVertexElement = 0;
1683 }
1684 }
1685 }
1686 }
1687 else
1688 {
1689 SWR_INVALID("Unsupported conversion type");
1690 }
1691 }
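// Editorial sketch (hedged): Shuffle16bpcGather16 above follows the SIMD16 pattern used
// throughout this file - split the 16-wide gather into two 8-wide halves, run the
// AVX2-style PSHUFB / permute / extend sequence on each half, then rejoin the results:
//
//     Value *lo  = EXTRACT_16(vWide, 0);        // lanes 0..7
//     Value *hi  = EXTRACT_16(vWide, 1);        // lanes 8..15
//     Value *out = JOIN_16(Op(lo), Op(hi));     // back to 16 lanes
//
// This sidesteps the 512-bit byte shuffles that AVX-512F (e.g. KNL) does not provide,
// as the comments above note.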
1692
1693 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
1694 {
1695 // Unpack tuple args
1696 Value* (&vGatherResult)[2] = std::get<0>(args);
1697 Value* pVtxOut = std::get<1>(args);
1698 const Instruction::CastOps extendType = std::get<2>(args);
1699 const ConversionType conversionType = std::get<3>(args);
1700 uint32_t &currentVertexElement = std::get<4>(args);
1701 uint32_t &outputElt = std::get<5>(args);
1702 const ComponentEnable compMask = std::get<6>(args);
1703 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1704 Value* (&vVertexElements)[4] = std::get<8>(args);
1705
1706 // cast types
1707 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1708 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1709
1710 // have to do extra work for sign extending
1711 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) ||
1712 (extendType == Instruction::CastOps::FPExt))
1713 {
1714         // is this a half-precision (FP16) float source?
1715         bool bFP = (extendType == Instruction::CastOps::FPExt);
1716
1717 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1718 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1719
1720 // shuffle mask
1721 Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1722 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
1723 Value* vi128XY = nullptr;
1724 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) {
1725 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1726 // after pshufb: group components together in each 128bit lane
1727 // 256i - 0 1 2 3 4 5 6 7
1728 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1729
1730 vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1731 // after PERMD: move and pack xy components into each 128bit lane
1732 // 256i - 0 1 2 3 4 5 6 7
1733 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1734 }
1735
1736 // do the same for zw components
1737 Value* vi128ZW = nullptr;
1738 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) {
1739 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1740 vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
1741 }
1742
1743 // init denormalize variables if needed
1744 Instruction::CastOps IntToFpCast;
1745 Value* conversionFactor;
1746
1747 switch (conversionType)
1748 {
1749 case CONVERT_NORMALIZED:
1750 IntToFpCast = Instruction::CastOps::SIToFP;
1751 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1752 break;
1753 case CONVERT_SSCALED:
1754 IntToFpCast = Instruction::CastOps::SIToFP;
1755         conversionFactor = VIMMED1(1.0f);
1756 break;
1757 case CONVERT_USCALED:
1758 SWR_INVALID("Type should not be sign extended!");
1759 conversionFactor = nullptr;
1760 break;
1761 default:
1762 SWR_ASSERT(conversionType == CONVERT_NONE);
1763 conversionFactor = nullptr;
1764 break;
1765 }
1766
1767         // sign extend all enabled components. If we have filled vVertexElements, output to the current simdvertex
1768 for (uint32_t i = 0; i < 4; i++)
1769 {
1770 if (isComponentEnabled(compMask, i))
1771 {
1772 if (compCtrl[i] == ComponentControl::StoreSrc)
1773 {
1774 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1775 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1776 // if x or y, use vi128XY permute result, else use vi128ZW
1777 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1778
1779 if (bFP) {
1780                     // extract 128 bit lanes and convert half-float components to full float
1781 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1782 }
1783 else {
1784 // extract 128 bit lanes to sign extend each component
1785 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1786
1787 // denormalize if needed
1788 if (conversionType != CONVERT_NONE) {
1789 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1790 }
1791 }
1792 currentVertexElement++;
1793 }
1794 else
1795 {
1796 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1797 }
1798
1799 if (currentVertexElement > 3)
1800 {
1801 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1802 // reset to the next vVertexElement to output
1803 currentVertexElement = 0;
1804 }
1805 }
1806 }
1807 }
1808 // else zero extend
1809 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1810 {
1811 // pshufb masks for each component
1812 Value* vConstMask[2];
1813 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) {
1814 // x/z shuffle mask
1815 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1816 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1817 }
1818
1819 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) {
1820 // y/w shuffle mask
1821 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1822 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
1823 }
1824
1825 // init denormalize variables if needed
1826 Instruction::CastOps fpCast;
1827 Value* conversionFactor;
1828
1829 switch (conversionType)
1830 {
1831 case CONVERT_NORMALIZED:
1832 fpCast = Instruction::CastOps::UIToFP;
1833 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1834 break;
1835 case CONVERT_USCALED:
1836 fpCast = Instruction::CastOps::UIToFP;
1837             conversionFactor = VIMMED1(1.0f);
1838 break;
1839 case CONVERT_SSCALED:
1840 SWR_INVALID("Type should not be zero extended!");
1841 conversionFactor = nullptr;
1842 break;
1843 default:
1844 SWR_ASSERT(conversionType == CONVERT_NONE);
1845 conversionFactor = nullptr;
1846 break;
1847 }
1848
1849         // shuffle enabled components into the lower word of each 32bit lane, zero-extending to 32 bits
1850 for (uint32_t i = 0; i < 4; i++)
1851 {
1852 if (isComponentEnabled(compMask, i))
1853 {
1854 if (compCtrl[i] == ComponentControl::StoreSrc)
1855 {
1856 // select correct constMask for x/z or y/w pshufb
1857 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1858 // if x or y, use vi128XY permute result, else use vi128ZW
1859 uint32_t selectedGather = (i < 2) ? 0 : 1;
1860
1861 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1862 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1863 // 256i - 0 1 2 3 4 5 6 7
1864 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1865
1866 // denormalize if needed
1867 if (conversionType != CONVERT_NONE)
1868 {
1869 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1870 }
1871 currentVertexElement++;
1872 }
1873 else
1874 {
1875 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1876 }
1877
1878 if (currentVertexElement > 3)
1879 {
1880 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1881 // reset to the next vVertexElement to output
1882 currentVertexElement = 0;
1883 }
1884 }
1885 }
1886 }
1887 else
1888 {
1889 SWR_INVALID("Unsupported conversion type");
1890 }
1891 }
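// Editorial summary (hedged) of how extendType drives the 16bpc handling above:
//
//     FPExt          -> half-float source, widened with CVTPH2PS
//     SExt / SIToFP  -> signed ints, PMOVSXWD then optional SNORM/SSCALED multiply
//     ZExt / UIToFP  -> unsigned ints, PSHUFB zero-extend then optional UNORM/USCALED multiply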
1892
1893 //////////////////////////////////////////////////////////////////////////
1894 /// @brief Output a simdvertex worth of elements to the current outputElt
1895 /// @param pVtxOut - base address of VIN output struct
1896 /// @param outputElt - simdvertex offset in VIN to write to
1897 /// @param numEltsToStore - number of simdvertex rows to write out
1898 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
1899 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
1900 {
1901 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
1902
1903 for (uint32_t c = 0; c < numEltsToStore; ++c)
1904 {
1905 // STORE expects FP32 x vWidth type, just bitcast if needed
1906 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
1907 {
1908 #if FETCH_DUMP_VERTEX
1909 PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
1910 #endif
1911 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
1912 }
1913 #if FETCH_DUMP_VERTEX
1914 else
1915 {
1916 PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
1917 }
1918 #endif
1919         // outputElt * 4 offsets by the size of one 4-row (vec4) simdvertex element;
1920         // + c offsets to the 32bit x vWidth row within the current element
1921 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
1922 STORE(vVertexElements[c], dest);
1923 }
1924 }
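// Editorial example (hedged): with the GEP indexing above, pVtxOut behaves as an array of
// FP32 x vWidth rows, four rows per output element. For instance outputElt = 2, c = 1
// stores to row 2 * 4 + 1 = 9, i.e. the y row of the third element written out.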
1925
1926 //////////////////////////////////////////////////////////////////////////
1927 /// @brief Generates a constant vector of values based on the
1928 /// ComponentControl value
1929 /// @param ctrl - ComponentControl value
1930 Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
1931 {
1932 switch (ctrl)
1933 {
1934 case NoStore:
1935 return VUNDEF_I();
1936 case Store0:
1937 return VIMMED1(0);
1938 case Store1Fp:
1939 return VIMMED1(1.0f);
1940 case Store1Int:
1941 return VIMMED1(1);
1942 case StoreVertexId:
1943 {
1944 if (mVWidth == 16)
1945 {
1946 Type* pSimd8FPTy = VectorType::get(mFP32Ty, 8);
1947 Value *pIdLo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), pSimd8FPTy);
1948 Value *pIdHi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), pSimd8FPTy);
1949 return JOIN_16(pIdLo, pIdHi);
1950 }
1951 else
1952 {
1953 return BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
1954 }
1955 }
1956 case StoreInstanceId:
1957 {
1958 Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
1959 return VBROADCAST(pId);
1960 }
1961
1962
1963 case StoreSrc:
1964 default:
1965 SWR_INVALID("Invalid component control");
1966 return VUNDEF_I();
1967 }
1968 }
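// Editorial example (hedged): compCtrl values are typically used to fill components that
// the source vertex format does not supply. A two-component 16-bit attribute expanded to
// a vec4 might use { StoreSrc, StoreSrc, Store0, Store1Fp }, producing (x, y, 0, 1).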
1969
1970 //////////////////////////////////////////////////////////////////////////
1971 /// @brief Returns the enable mask for the specified component.
1972 /// @param enableMask - enable bits
1973 /// @param component - component to check if enabled.
1974 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
1975 {
1976 switch (component)
1977 {
1978 // X
1979 case 0: return (enableMask & ComponentEnable::X);
1980 // Y
1981 case 1: return (enableMask & ComponentEnable::Y);
1982 // Z
1983 case 2: return (enableMask & ComponentEnable::Z);
1984 // W
1985 case 3: return (enableMask & ComponentEnable::W);
1986
1987 default: return false;
1988 }
1989 }
1990
1991 // Don't allow two threads to compile the same fetch shader simultaneously;
1992 // the JIT cache implementation has problems with concurrent compilation.
1993 // Right now this is only a problem for fetch.
1994 static std::mutex gFetchCodegenMutex;
1995
1996 //////////////////////////////////////////////////////////////////////////
1997 /// @brief JITs the fetch shader IR into executable code
1998 /// @param hJitMgr - JitManager handle
1999 /// @param hFunc - handle to the fetch shader's LLVM function IR
2000 /// @return PFN_FETCH_FUNC - pointer to fetch code
2001 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
2002 {
2003 const llvm::Function* func = (const llvm::Function*)hFunc;
2004 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2005 PFN_FETCH_FUNC pfnFetch;
2006
2007 gFetchCodegenMutex.lock();
2008 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
2009     // MCJIT finalizes modules the first time code is JITted from them. Once finalized, no new IR can be added to the module.
2010 pJitMgr->mIsModuleFinalized = true;
2011
2012 #if defined(KNOB_SWRC_TRACING)
2013 char fName[1024];
2014 const char *funcName = func->getName().data();
2015     snprintf(fName, sizeof(fName), "%s.bin", funcName);
2016 FILE *fd = fopen(fName, "wb");
2017     if (fd) { fwrite((void *)pfnFetch, 1, 2048, fd); fclose(fd); }
2019 #endif
2020
2021 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
2022 gFetchCodegenMutex.unlock();
2023
2026 return pfnFetch;
2027 }
2028
2029 //////////////////////////////////////////////////////////////////////////
2030 /// @brief JIT compiles fetch shader
2031 /// @param hJitMgr - JitManager handle
2032 /// @param state - fetch state to build function from
2033 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
2034 {
2035 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2036
2037 pJitMgr->SetupNewModule();
2038
2039 FetchJit theJit(pJitMgr);
2040 HANDLE hFunc = theJit.Create(state);
2041
2042 return JitFetchFunc(hJitMgr, hFunc);
2043 }
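// Editorial usage sketch (hedged, hypothetical helper names): a driver front end would
// typically compile once per unique fetch state and cache the result, e.g.
//
//     FETCH_COMPILE_STATE fetchState = BuildFetchState(vertexElements, vertexBuffers); // hypothetical
//     PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, fetchState);
//     // pfnFetch is later invoked by the frontend with a populated SWR_FETCH_CONTEXT
//     // to produce one simdvertex worth of shader inputs per call.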