1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder.h"
32 #include "jit_api.h"
33 #include "fetch_jit.h"
34 #include "gen_state_llvm.h"
35
36 //#define FETCH_DUMP_VERTEX 1
37 using namespace llvm;
38 using namespace SwrJit;
39
40 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
41
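// How gathered integer data is converted to float later in this file (a brief note on the
// enumerators below):
//   CONVERT_NORMALIZED - UNORM/SNORM data scaled to [0,1] / [-1,1]
//   CONVERT_USCALED    - unsigned integer converted directly to float (e.g. 200 -> 200.0f)
//   CONVERT_SSCALED    - signed integer converted directly to float
//   CONVERT_SFIXED     - 16.16 fixed point, scaled by 1/65536 (e.g. 0x00018000 -> 1.5f)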
42 enum ConversionType
43 {
44 CONVERT_NONE,
45 CONVERT_NORMALIZED,
46 CONVERT_USCALED,
47 CONVERT_SSCALED,
48 CONVERT_SFIXED,
49 };
50
51 #if USE_SIMD16_SHADERS
52 #define USE_SIMD16_GATHERS 0
53 #endif
54
55 //////////////////////////////////////////////////////////////////////////
56 /// Interface to Jitting a fetch shader
57 //////////////////////////////////////////////////////////////////////////
58 struct FetchJit :
59 public Builder
60 {
61 FetchJit(JitManager* pJitMgr) :
62 Builder(pJitMgr)
63 {}
64
65 Function* Create(const FETCH_COMPILE_STATE& fetchState);
66
67 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
68 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
69 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
70
71 // package up Shuffle*bpcGatherd args into a tuple for convenience
72 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
73 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
74 const uint32_t(&)[4]> Shuffle8bpcArgs;
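// Shuffle8bpcArgs fields, in order: gathered dword result, pVtxOut, extend cast op, conversion
// type, currentVertexElement (in/out), outputElt (in/out), component enable mask, component
// controls, output vertex element array, and the format's component swizzle.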
75
76 #if USE_SIMD16_SHADERS
77 #if USE_SIMD16_GATHERS
78 void Shuffle8bpcGatherd16(Shuffle8bpcArgs &args);
79 #else
80 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2);
81 #endif
82 #else
83 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
84 #endif
85
86 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
87 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
88
89 #if USE_SIMD16_SHADERS
90 #if USE_SIMD16_GATHERS
91 void Shuffle16bpcGather16(Shuffle16bpcArgs &args);
92 #else
93 void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2);
94 #endif
95 #else
96 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
97 #endif
98
99 #if USE_SIMD16_GATHERS
100 void StoreVertexElements16(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
101 #else
102 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
103 #endif
104
105 #if USE_SIMD16_SHADERS
106 #if USE_SIMD16_GATHERS
107 Value *GenerateCompCtrlVector16(const ComponentControl ctrl);
108 #else
109 Value *GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2);
110 #endif
111 #else
112 Value *GenerateCompCtrlVector(const ComponentControl ctrl);
113 #endif
114
115 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
116
117 #if USE_SIMD16_SHADERS
118 #if USE_SIMD16_GATHERS
119 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2);
120 #else
121 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2);
122 #endif
123 #else
124 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
125 #endif
126
127 bool IsOddFormat(SWR_FORMAT format);
128 bool IsUniformFormat(SWR_FORMAT format);
129 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
130 void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
131 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
132
133 Value* mpFetchInfo;
134 };
135
136 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
137 {
138 std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
139 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
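// The function name is "FCH_" followed by a CRC of the fetch compile state, so identical
// fetch states produce the same symbol name (e.g. "FCH_305419896" -- value hypothetical).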
140
141 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
142 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
143
144 fetch->getParent()->setModuleIdentifier(fetch->getName());
145
146 IRB()->SetInsertPoint(entry);
147
148 auto argitr = fetch->arg_begin();
149
150 // Fetch shader arguments
151 Value* privateContext = &*argitr; ++argitr;
152 privateContext->setName("privateContext");
153 SetPrivateContext(privateContext);
154
155 mpFetchInfo = &*argitr; ++argitr;
156 mpFetchInfo->setName("fetchInfo");
157 Value* pVtxOut = &*argitr;
158 pVtxOut->setName("vtxOutput");
159 // This is just shorthand to tell LLVM to get a pointer to the base address of the simdvertex:
160 // index 0 is just the pointer to the simdvertex structure,
161 // index 1 is which element of the simdvertex structure to offset to (in this case 0),
162 // so the indices being i32's doesn't matter.
163 // TODO: generate this GEP with a VECTOR structure type so this makes sense.
164 std::vector<Value*> vtxInputIndices(2, C(0));
165 // GEP
166 pVtxOut = GEP(pVtxOut, C(0));
167 #if USE_SIMD16_SHADERS
168 #if 0// USE_SIMD16_BUILDER
169 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
170 #else
171 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
172 #endif
173 #else
174 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
175 #endif
176
177 // SWR_FETCH_CONTEXT::pStreams
178 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
179 streams->setName("pStreams");
180
181 // SWR_FETCH_CONTEXT::pIndices
182 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
183 indices->setName("pIndices");
184
185 // SWR_FETCH_CONTEXT::pLastIndex
186 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
187 pLastIndex->setName("pLastIndex");
188
189
190 Value* vIndices;
191 #if USE_SIMD16_SHADERS
192 Value* indices2;
193 Value* vIndices2;
194 #endif
195 switch(fetchState.indexType)
196 {
197 case R8_UINT:
198 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
199 #if USE_SIMD16_SHADERS
200 indices2 = GEP(indices, C(8));
201 #endif
202 if(fetchState.bDisableIndexOOBCheck)
203 {
204 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
205 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
206 #if USE_SIMD16_SHADERS
207 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
208 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
209 #endif
210 }
211 else
212 {
213 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
214 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
215 #if USE_SIMD16_SHADERS
216 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
217 vIndices2 = GetSimdValid8bitIndices(indices2, pLastIndex);
218 #endif
219 }
220 break;
221 case R16_UINT:
222 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
223 #if USE_SIMD16_SHADERS
224 indices2 = GEP(indices, C(8));
225 #endif
226 if(fetchState.bDisableIndexOOBCheck)
227 {
228 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
229 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
230 #if USE_SIMD16_SHADERS
231 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
232 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
233 #endif
234 }
235 else
236 {
237 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
238 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
239 #if USE_SIMD16_SHADERS
240 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
241 vIndices2 = GetSimdValid16bitIndices(indices2, pLastIndex);
242 #endif
243 }
244 break;
245 case R32_UINT:
246 #if USE_SIMD16_SHADERS
247 indices2 = GEP(indices, C(8));
248 #endif
249 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
250 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
251 #if USE_SIMD16_SHADERS
252 (fetchState.bDisableIndexOOBCheck) ? vIndices2 = LOAD(BITCAST(indices2, PointerType::get(mSimdInt32Ty, 0)), { (uint32_t)0 })
253 : vIndices2 = GetSimdValid32bitIndices(indices2, pLastIndex);
254 #endif
255 break; // incoming type is already 32bit int
256 default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break;
257 }
258
259 if(fetchState.bForceSequentialAccessEnable)
260 {
261 Value* pOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
262
263 // VertexData buffers are accessed sequentially, the index is equal to the vertex number
264 vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
265 vIndices = ADD(vIndices, pOffsets);
266 #if USE_SIMD16_SHADERS
267 vIndices2 = ADD(vIndices, VIMMED1(8));
268 #endif
269 }
270
271 Value* vVertexId = vIndices;
272 #if USE_SIMD16_SHADERS
273 Value* vVertexId2 = vIndices2;
274 #endif
275 if (fetchState.bVertexIDOffsetEnable)
276 {
277 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
278 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
279 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
280 vVertexId = ADD(vIndices, vBaseVertex);
281 vVertexId = ADD(vVertexId, vStartVertex);
282 #if USE_SIMD16_SHADERS
283 vVertexId2 = ADD(vIndices2, vBaseVertex);
284 vVertexId2 = ADD(vVertexId2, vStartVertex);
285 #endif
286 }
287
288 // store out vertex IDs
289 STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
290 #if USE_SIMD16_SHADERS
291 STORE(vVertexId2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
292 #endif
293
294 // store out cut mask if enabled
295 if (fetchState.bEnableCutIndex)
296 {
297 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
298 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
299 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
300 #if USE_SIMD16_SHADERS
301 Value* cutMask2 = VMASK(ICMP_EQ(vIndices2, vCutIndex));
302 STORE(cutMask2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
303 #endif
304 }
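// Example: with cutIndex == 0xFFFF and vIndices == { 0, 1, 0xFFFF, 3, ... }, the compare above
// is true only in lane 2, so only that lane of the stored cut mask is set.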
305
306 // Fetch attributes from memory and output to a simdvertex struct
307 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
308 #if USE_SIMD16_SHADERS
309 if (fetchState.bDisableVGATHER)
310 {
311 JitLoadVertices(fetchState, streams, vIndices, pVtxOut);
312 JitLoadVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)));
313 }
314 else
315 {
316 #if USE_SIMD16_GATHERS
317 JitGatherVertices(fetchState, streams, vIndices, vIndices2, pVtxOut, false);
318 #else
319 JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false);
320 JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true);
321 #endif
322 }
323 #else
324 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
325 : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
326 #endif
327
328 RET_VOID();
329
330 JitManager::DumpToFile(fetch, "src");
331
332 #if defined(_DEBUG)
333 verifyFunction(*fetch);
334 #endif
335
336 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
337
338 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
339 setupPasses.add(createBreakCriticalEdgesPass());
340 setupPasses.add(createCFGSimplificationPass());
341 setupPasses.add(createEarlyCSEPass());
342 setupPasses.add(createPromoteMemoryToRegisterPass());
343
344 setupPasses.run(*fetch);
345
346 JitManager::DumpToFile(fetch, "se");
347
348 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
349
350 ///@todo Haven't touched these either. Need to remove some of these and add others.
351 optPasses.add(createCFGSimplificationPass());
352 optPasses.add(createEarlyCSEPass());
353 optPasses.add(createInstructionCombiningPass());
354 optPasses.add(createInstructionSimplifierPass());
355 optPasses.add(createConstantPropagationPass());
356 optPasses.add(createSCCPPass());
357 optPasses.add(createAggressiveDCEPass());
358
359 optPasses.run(*fetch);
360 optPasses.run(*fetch);
361
362 JitManager::DumpToFile(fetch, "opt");
363
364
365 return fetch;
366 }
367
368 //////////////////////////////////////////////////////////////////////////
369 /// @brief Loads attributes from memory using LOADs, shuffling the
370 /// components into SOA form.
371 /// *Note* currently does not support component control,
372 /// component packing, instancing
373 /// @param fetchState - info about attributes to be fetched from memory
374 /// @param streams - value pointer to the current vertex stream
375 /// @param vIndices - vector value of indices to load
376 /// @param pVtxOut - value pointer to output simdvertex struct
377 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
378 {
379 // Zack shuffles; a variant of the Charleston.
380
381 std::vector<Value*> vectors(16);
382 std::vector<Constant*> pMask(mVWidth);
383 for(uint32_t i = 0; i < mVWidth; ++i)
384 {
385 pMask[i] = (C(i < 4 ? i : 4));
386 }
387 Constant* promoteMask = ConstantVector::get(pMask);
388 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
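// For mVWidth == 8 (AVX), promoteMask is <0,1,2,3,4,4,4,4>: the shuffle later widens each
// 4-component attribute load to the full SIMD width, with the upper lanes taken from the
// undef vector.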
389
390 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
391 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
392 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
393 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
394 curInstance->setName("curInstance");
395
396 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
397 {
398 Value* elements[4] = {0};
399 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
400 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
401 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
402 uint32_t numComponents = info.numComps;
403 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
404
405 // load path doesn't support component packing
406 SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
407
408 vectors.clear();
409
410 if (fetchState.bInstanceIDOffsetEnable)
411 {
412 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down");
413 }
414
415 Value *vCurIndices;
416 Value *startOffset;
417 if(ied.InstanceEnable)
418 {
419 Value* stepRate = C(ied.InstanceAdvancementState);
420
421 // prevent a div by 0 for 0 step rate
422 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
423 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
424
425 // calc the current offset into instanced data buffer
426 Value* calcInstance = UDIV(curInstance, stepRate);
427
428 // if step rate is 0, every instance gets instance 0
429 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
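// e.g. with a step rate of 3, instances 0..2 read instanced element 0, instances 3..5 read
// element 1, and so on.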
430
431 vCurIndices = VBROADCAST(calcInstance);
432
433 startOffset = startInstance;
434 }
435 else if (ied.InstanceStrideEnable)
436 {
437 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
438 }
439 else
440 {
441 // offset indices by baseVertex
442 vCurIndices = ADD(vIndices, vBaseVertex);
443
444 startOffset = startVertex;
445 }
446
447 // load SWR_VERTEX_BUFFER_STATE::pData
448 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
449
450 // load SWR_VERTEX_BUFFER_STATE::pitch
451 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
452 stride = Z_EXT(stride, mInt64Ty);
453
454 // load SWR_VERTEX_BUFFER_STATE::size
455 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
456 size = Z_EXT(size, mInt64Ty);
457
458 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
459
460 Value *minVertex = NULL;
461 Value *minVertexOffset = NULL;
462 if (fetchState.bPartialVertexBuffer) {
463 // fetch min index for low bounds checking
464 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
465 minVertex = LOAD(minVertex);
466 if (!fetchState.bDisableIndexOOBCheck) {
467 minVertexOffset = MUL(Z_EXT(minVertex, mInt64Ty), stride);
468 }
469 }
470
471 // Load from the stream.
472 for(uint32_t lane = 0; lane < mVWidth; ++lane)
473 {
474 // Get index
475 Value* index = VEXTRACT(vCurIndices, C(lane));
476
477 if (fetchState.bPartialVertexBuffer) {
478 // clamp below minvertex
479 Value *isBelowMin = ICMP_SLT(index, minVertex);
480 index = SELECT(isBelowMin, minVertex, index);
481 }
482
483 index = Z_EXT(index, mInt64Ty);
484
485 Value* offset = MUL(index, stride);
486 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
487 offset = ADD(offset, startVertexOffset);
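// e.g. index 5, stride 32, AlignedByteOffset 12 -> 5 * 32 + 12 = 172 bytes, plus
// startVertexOffset, from the stream base.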
488
489 if (!fetchState.bDisableIndexOOBCheck) {
490 // check for out of bound access, including partial OOB, and replace them with minVertex
491 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
492 Value *oob = ICMP_ULE(endOffset, size);
493 if (fetchState.bPartialVertexBuffer) {
494 offset = SELECT(oob, offset, minVertexOffset);
495 } else {
496 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
497 }
498 }
499
500 Value* pointer = GEP(stream, offset);
501 // We load a full 4-component vector per lane even if fewer components are needed; the extras are ignored.
502 Value* vptr = 0;
503
504 // get a pointer to a 4 component attrib in default address space
505 switch(bpc)
506 {
507 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
508 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
509 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
510 default: SWR_INVALID("Unsupported underlying bpp!");
511 }
512
513 // load 4 components of attribute
514 Value* vec = ALIGNED_LOAD(vptr, 1, false);
515
516 // Convert To FP32 internally
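// e.g. 8-bit UNORM 255 -> 255 * (1/255) = 1.0f; 8-bit SNORM uses a 1/128 scale here
// (so -128 -> -1.0f); UINT/SINT are zero/sign extended to 32 bits and bitcast, keeping
// their integer bit patterns.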
517 switch(info.type[0])
518 {
519 case SWR_TYPE_UNORM:
520 switch(bpc)
521 {
522 case 8:
523 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
524 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
525 break;
526 case 16:
527 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
528 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
529 break;
530 default:
531 SWR_INVALID("Unsupported underlying type!");
532 break;
533 }
534 break;
535 case SWR_TYPE_SNORM:
536 switch(bpc)
537 {
538 case 8:
539 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
540 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
541 break;
542 case 16:
543 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
544 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
545 break;
546 default:
547 SWR_INVALID("Unsupported underlying type!");
548 break;
549 }
550 break;
551 case SWR_TYPE_UINT:
552 // Zero extend uint32_t types.
553 switch(bpc)
554 {
555 case 8:
556 case 16:
557 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
558 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
559 break;
560 case 32:
561 break; // Pass through unchanged.
562 default:
563 SWR_INVALID("Unsupported underlying type!");
564 break;
565 }
566 break;
567 case SWR_TYPE_SINT:
568 // Sign extend SINT types.
569 switch(bpc)
570 {
571 case 8:
572 case 16:
573 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
574 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
575 break;
576 case 32:
577 break; // Pass through unchanged.
578 default:
579 SWR_INVALID("Unsupported underlying type!");
580 break;
581 }
582 break;
583 case SWR_TYPE_FLOAT:
584 switch(bpc)
585 {
586 case 32:
587 break; // Pass through unchanged.
588 default:
589 SWR_INVALID("Unsupported underlying type!");
590 }
591 break;
592 case SWR_TYPE_USCALED:
593 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
594 break;
595 case SWR_TYPE_SSCALED:
596 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
597 break;
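// SFIXED is 16.16 fixed point: e.g. a raw value of 0x00018000 (98304) scales to
// 98304 / 65536 = 1.5f.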
598 case SWR_TYPE_SFIXED:
599 vec = FMUL(SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)), VBROADCAST(C(1/65536.0f)));
600 break;
601 case SWR_TYPE_UNKNOWN:
602 case SWR_TYPE_UNUSED:
603 SWR_INVALID("Unsupported type %d!", info.type[0]);
604 }
605
606 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
607 // uwvec: 4 x F32, undef value
608 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
609 vectors.push_back(wvec);
610 }
611
612 std::vector<Constant*> v01Mask(mVWidth);
613 std::vector<Constant*> v23Mask(mVWidth);
614 std::vector<Constant*> v02Mask(mVWidth);
615 std::vector<Constant*> v13Mask(mVWidth);
616
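// The masks below implement a 4x4 transpose per group of four lanes, converting the per-lane
// xyzw (AOS) vectors loaded above into per-component (SOA) simd registers.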
617 // Concatenate the vectors together.
618 elements[0] = VUNDEF_F();
619 elements[1] = VUNDEF_F();
620 elements[2] = VUNDEF_F();
621 elements[3] = VUNDEF_F();
622 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
623 {
624 v01Mask[4 * b + 0] = C(0 + 4 * b);
625 v01Mask[4 * b + 1] = C(1 + 4 * b);
626 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
627 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
628
629 v23Mask[4 * b + 0] = C(2 + 4 * b);
630 v23Mask[4 * b + 1] = C(3 + 4 * b);
631 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
632 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
633
634 v02Mask[4 * b + 0] = C(0 + 4 * b);
635 v02Mask[4 * b + 1] = C(2 + 4 * b);
636 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
637 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
638
639 v13Mask[4 * b + 0] = C(1 + 4 * b);
640 v13Mask[4 * b + 1] = C(3 + 4 * b);
641 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
642 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
643
644 std::vector<Constant*> iMask(mVWidth);
645 for(uint32_t i = 0; i < mVWidth; ++i)
646 {
647 if(((4 * b) <= i) && (i < (4 * (b + 1))))
648 {
649 iMask[i] = C(i % 4 + mVWidth);
650 }
651 else
652 {
653 iMask[i] = C(i);
654 }
655 }
656 Constant* insertMask = ConstantVector::get(iMask);
657 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
658 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
659 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
660 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
661 }
662
663 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
664 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
665 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
666 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
667 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
668 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
669 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
670 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
671
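// Default any components the format doesn't supply; the switch below relies on fallthrough,
// e.g. a 2-component format enters at case 3, zeroing z and then setting w to 1.0f.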
672 switch(numComponents + 1)
673 {
674 case 1: elements[0] = VIMMED1(0.0f);
675 case 2: elements[1] = VIMMED1(0.0f);
676 case 3: elements[2] = VIMMED1(0.0f);
677 case 4: elements[3] = VIMMED1(1.0f);
678 }
679
680 for(uint32_t c = 0; c < 4; ++c)
681 {
682 #if USE_SIMD16_SHADERS
683 Value* dest = GEP(pVtxOut, C(nelt * 8 + c * 2), "destGEP");
684 #else
685 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
686 #endif
687 STORE(elements[c], dest);
688 }
689 }
690 }
691
692 // returns true for odd formats that require special gather handling
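// e.g. R10G10B10A2_UNORM (10-bit components) is odd; R8G8B8A8_UNORM is not.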
693 bool FetchJit::IsOddFormat(SWR_FORMAT format)
694 {
695 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
696 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
697 {
698 return true;
699 }
700 return false;
701 }
702
703 // format is uniform if all components are the same size and type
704 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
705 {
706 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
707 uint32_t bpc0 = info.bpc[0];
708 uint32_t type0 = info.type[0];
709
710 for (uint32_t c = 1; c < info.numComps; ++c)
711 {
712 if (bpc0 != info.bpc[c] || type0 != info.type[c])
713 {
714 return false;
715 }
716 }
717 return true;
718 }
719
720 // unpacks components based on format
721 // foreach component in the pixel
722 // mask off everything but this component
723 // shift component to LSB
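// e.g. for a packed 10-10-10-2 format the loop below extracts (in bit order, before swizzle
// placement): v & 0x3FF, (v & 0xFFC00) >> 10, (v & 0x3FF00000) >> 20, (v & 0xC0000000) >> 30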
724 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
725 {
726 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
727
728 uint32_t bitOffset = 0;
729 for (uint32_t c = 0; c < info.numComps; ++c)
730 {
731 uint32_t swizzledIndex = info.swizzle[c];
732 uint32_t compBits = info.bpc[c];
733 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
734 Value* comp = AND(vInput, bitmask);
735 comp = LSHR(comp, bitOffset);
736
737 result[swizzledIndex] = comp;
738 bitOffset += compBits;
739 }
740 }
741
742 // gather for odd component size formats
743 // gather SIMD full pixels per lane, then shift/mask to move each component into its
744 // own vector
745 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
746 {
747 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
748
749 // only works if pixel size is <= 32bits
750 SWR_ASSERT(info.bpp <= 32);
751
752 Value *pGather;
753 if (info.bpp == 32)
754 {
755 pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
756 }
757 else
758 {
759 // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
760 Value *pMem = ALLOCA(mSimdInt32Ty);
761 STORE(VIMMED1(0u), pMem);
762
763 pBase = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
764 Value* pDstMem = BITCAST(pMem, mInt32PtrTy);
765
766 for (uint32_t lane = 0; lane < mVWidth; ++lane)
767 {
768 // Get index
769 Value* index = VEXTRACT(pOffsets, C(lane));
770 Value* mask = VEXTRACT(pMask, C(lane));
771 switch (info.bpp)
772 {
773 case 8:
774 {
775 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
776 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt8Ty, 0));
777 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
778 break;
779 }
780
781 case 16:
782 {
783 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
784 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
785 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
786 break;
787 }
788 break;
789
790 case 24:
791 {
792 // First 16-bits of data
793 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
794 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
795 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
796
797 // Last 8-bits of data
798 pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
799 pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0));
800 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
801 break;
802 }
803
804 default:
805 SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
806 break;
807 }
808 }
809
810 pGather = LOAD(pMem);
811 }
812
813 for (uint32_t comp = 0; comp < 4; ++comp)
814 {
815 pResult[comp] = VIMMED1((int)info.defaults[comp]);
816 }
817
818 UnpackComponents(format, pGather, pResult);
819
820 // cast to fp32
821 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
822 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
823 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
824 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
825 }
826
827 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
828 {
829 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
830
831 for (uint32_t c = 0; c < info.numComps; ++c)
832 {
833 uint32_t compIndex = info.swizzle[c];
834
835 // skip any conversion on UNUSED components
836 if (info.type[c] == SWR_TYPE_UNUSED)
837 {
838 continue;
839 }
840
841 if (info.isNormalized[c])
842 {
843 if (info.type[c] == SWR_TYPE_SNORM)
844 {
845 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
846
847 /// result = c * (1.0f / (2^(n-1) - 1);
848 uint32_t n = info.bpc[c];
849 uint32_t pow2 = 1 << (n - 1);
850 float scale = 1.0f / (float)(pow2 - 1);
851 Value *vScale = VIMMED1(scale);
852 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
853 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
854 texels[compIndex] = FMUL(texels[compIndex], vScale);
855 }
856 else
857 {
858 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
859
860 /// result = c * (1.0f / (2^n - 1))
861 uint32_t n = info.bpc[c];
862 uint32_t pow2 = 1 << n;
863 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
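// (scale = 2^24 - 1 = 16777215; multiplying by its fp32 reciprocal is not precise enough,
// hence the FDIV below)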
864 if (n == 24)
865 {
866 float scale = (float)(pow2 - 1);
867 Value* vScale = VIMMED1(scale);
868 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
869 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
870 texels[compIndex] = FDIV(texels[compIndex], vScale);
871 }
872 else
873 {
874 float scale = 1.0f / (float)(pow2 - 1);
875 Value *vScale = VIMMED1(scale);
876 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
877 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
878 texels[compIndex] = FMUL(texels[compIndex], vScale);
879 }
880 }
881 continue;
882 }
883 }
884 }
885
886 //////////////////////////////////////////////////////////////////////////
887 /// @brief Loads attributes from memory using AVX2 GATHER(s)
888 /// @param fetchState - info about attributes to be fetched from memory
889 /// @param streams - value pointer to the current vertex stream
890 /// @param vIndices - vector value of indices to gather
891 /// @param pVtxOut - value pointer to output simdvertex struct
892 #if USE_SIMD16_SHADERS
893 #if USE_SIMD16_GATHERS
894 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
895 Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2)
896 #else
897 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
898 Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2)
899 #endif
900 #else
901 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
902 Value* streams, Value* vIndices, Value* pVtxOut)
903 #endif
904 {
905 uint32_t currentVertexElement = 0;
906 uint32_t outputElt = 0;
907 Value* vVertexElements[4];
908 #if USE_SIMD16_GATHERS
909 Value *pVtxSrc2[4];
910 #endif
911
912 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
913 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
914 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
915 #if USE_SIMD16_GATHERS
916 Value* vBaseVertex16 = VBROADCAST_16(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
917 #else
918 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
919 #endif
920 curInstance->setName("curInstance");
921
922 for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
923 {
924 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
925
926 // skip element if all components are disabled
927 if (ied.ComponentPacking == ComponentEnable::NONE)
928 {
929 continue;
930 }
931
932 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
933 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
934 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
935
936 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
937
938 // VGATHER* takes an *i8 src pointer
939 Value *pStreamBase = INT_TO_PTR(stream, PointerType::get(mInt8Ty, 0));
940
941 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
942 #if USE_SIMD16_GATHERS
943 Value *vStride16 = VBROADCAST_16(stride);
944 #else
945 Value *vStride = VBROADCAST(stride);
946 #endif
947
948 // max vertex index that is fully in bounds
949 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
950 maxVertex = LOAD(maxVertex);
951
952 Value *minVertex = NULL;
953 if (fetchState.bPartialVertexBuffer)
954 {
955 // min vertex index for low bounds OOB checking
956 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
957 minVertex = LOAD(minVertex);
958 }
959
960 if (fetchState.bInstanceIDOffsetEnable)
961 {
962 // the InstanceID (curInstance) value is offset by StartInstanceLocation
963 curInstance = ADD(curInstance, startInstance);
964 }
965
966 #if USE_SIMD16_GATHERS
967 Value *vCurIndices16;
968 #else
969 Value *vCurIndices;
970 #endif
971 Value *startOffset;
972 #if USE_SIMD16_GATHERS
973 Value *vInstanceStride16 = VIMMED1_16(0);
974 #else
975 Value *vInstanceStride = VIMMED1(0);
976 #endif
977
978 if (ied.InstanceEnable)
979 {
980 Value* stepRate = C(ied.InstanceAdvancementState);
981
982 // prevent a div by 0 for 0 step rate
983 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
984 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
985
986 // calc the current offset into instanced data buffer
987 Value* calcInstance = UDIV(curInstance, stepRate);
988
989 // if step rate is 0, every instance gets instance 0
990 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
991
992 #if USE_SIMD16_GATHERS
993 vCurIndices16 = VBROADCAST_16(calcInstance);
994 #else
995 vCurIndices = VBROADCAST(calcInstance);
996 #endif
997
998 startOffset = startInstance;
999 }
1000 else if (ied.InstanceStrideEnable)
1001 {
1002 // grab the instance advancement state, determines stride in bytes from one instance to the next
1003 Value* stepRate = C(ied.InstanceAdvancementState);
1004 #if USE_SIMD16_GATHERS
1005 vInstanceStride16 = VBROADCAST_16(MUL(curInstance, stepRate));
1006 #else
1007 vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
1008 #endif
1009
1010 // offset indices by baseVertex
1011 #if USE_SIMD16_GATHERS
1012 Value *vIndices16 = JOIN_16(vIndices, vIndices2);
1013
1014 vCurIndices16 = ADD(vIndices16, vBaseVertex16);
1015 #else
1016 vCurIndices = ADD(vIndices, vBaseVertex);
1017 #endif
1018
1019 startOffset = startVertex;
1020 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
1021 }
1022 else
1023 {
1024 // offset indices by baseVertex
1025 #if USE_SIMD16_GATHERS
1026 Value *vIndices16 = JOIN_16(vIndices, vIndices2);
1027
1028 vCurIndices16 = ADD(vIndices16, vBaseVertex16);
1029 #else
1030 vCurIndices = ADD(vIndices, vBaseVertex);
1031 #endif
1032
1033 startOffset = startVertex;
1034 }
1035
1036 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
1037 // do 64bit address offset calculations.
1038
1039 // calculate byte offset to the start of the VB
1040 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
1041 pStreamBase = GEP(pStreamBase, baseOffset);
1042 Value* pStreamBaseGFX = ADD(stream, baseOffset);
1043
1044 // if we have a start offset, subtract from max vertex. Used for OOB check
1045 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
1046 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
1047 // if we have a negative value, we're already OOB. clamp at 0.
1048 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
1049
1050 if (fetchState.bPartialVertexBuffer)
1051 {
1052 // similarly for min vertex
1053 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
1054 Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
1055 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
1056 }
1057
1058 // Load the in bounds size of a partially valid vertex
1059 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
1060 partialInboundsSize = LOAD(partialInboundsSize);
1061 #if USE_SIMD16_GATHERS
1062 Value *vPartialVertexSize = VBROADCAST_16(partialInboundsSize);
1063 Value *vBpp = VBROADCAST_16(C(info.Bpp));
1064 Value *vAlignmentOffsets = VBROADCAST_16(C(ied.AlignedByteOffset));
1065 #else
1066 Value *vPartialVertexSize = VBROADCAST(partialInboundsSize);
1067 Value *vBpp = VBROADCAST(C(info.Bpp));
1068 Value *vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
1069 #endif
1070
1071 // is the element <= the partially valid size?
1072 Value *vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
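// e.g. an element with Bpp = 4 at AlignedByteOffset 8 is usable when partialInboundsSize
// is 12 (4 <= 12 - 8), but not when it is 10 (4 > 10 - 8).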
1073
1074 #if USE_SIMD16_GATHERS
1075 // override cur indices with 0 if pitch is 0
1076 Value *pZeroPitchMask16 = ICMP_EQ(vStride16, VIMMED1_16(0));
1077 vCurIndices16 = SELECT(pZeroPitchMask16, VIMMED1_16(0), vCurIndices16);
1078
1079 // are vertices partially OOB?
1080 Value *vMaxVertex16 = VBROADCAST_16(maxVertex);
1081 Value *vPartialOOBMask = ICMP_EQ(vCurIndices16, vMaxVertex16);
1082
1083 // are vertices fully in bounds?
1084 Value *vMaxGatherMask16 = ICMP_ULT(vCurIndices16, vMaxVertex16);
1085
1086 Value *vGatherMask16;
1087
1088 if (fetchState.bPartialVertexBuffer)
1089 {
1090 // are vertices below minVertex limit?
1091 Value *vMinVertex16 = VBROADCAST_16(minVertex);
1092 Value *vMinGatherMask16 = ICMP_UGE(vCurIndices16, vMinVertex16);
1093
1094 // only fetch lanes that pass both tests
1095 vGatherMask16 = AND(vMaxGatherMask16, vMinGatherMask16);
1096 }
1097 else
1098 {
1099 vGatherMask16 = vMaxGatherMask16;
1100 }
1101
1102 // blend in any partially OOB indices that have valid elements
1103 vGatherMask16 = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask16);
1104
1105 // calculate the actual offsets into the VB
1106 Value *vOffsets16 = MUL(vCurIndices16, vStride16);
1107 vOffsets16 = ADD(vOffsets16, vAlignmentOffsets);
1108
1109 // if instance stride enable is:
1110 // true - add the product of the instanceID and advancement state to the offset into the VB
1111 // false - the value of vInstanceStride has been initialized to zero
1112 vOffsets16 = ADD(vOffsets16, vInstanceStride16);
1113
1114 // TODO: remove the following simd8 interop stuff once all code paths are fully widened to SIMD16..
1115
1116 Value *vGatherMask = EXTRACT_16(vGatherMask16, 0);
1117 Value *vGatherMask2 = EXTRACT_16(vGatherMask16, 1);
1118
1119 Value *vOffsets = EXTRACT_16(vOffsets16, 0);
1120 Value *vOffsets2 = EXTRACT_16(vOffsets16, 1);
1121 #else
1122 // override cur indices with 0 if pitch is 0
1123 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
1124 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
1125
1126 // are vertices partially OOB?
1127 Value* vMaxVertex = VBROADCAST(maxVertex);
1128 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
1129
1130 // are vertices fully in bounds?
1131 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
1132
1133 Value *vGatherMask;
1134 if (fetchState.bPartialVertexBuffer)
1135 {
1136 // are vertices below minVertex limit?
1137 Value *vMinVertex = VBROADCAST(minVertex);
1138 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
1139
1140 // only fetch lanes that pass both tests
1141 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
1142 }
1143 else
1144 {
1145 vGatherMask = vMaxGatherMask;
1146 }
1147
1148 // blend in any partially OOB indices that have valid elements
1149 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
1150
1151 // calculate the actual offsets into the VB
1152 Value* vOffsets = MUL(vCurIndices, vStride);
1153 vOffsets = ADD(vOffsets, vAlignmentOffsets);
1154
1155 // if instance stride enable is:
1156 // true - add the product of the instanceID and advancement state to the offset into the VB
1157 // false - the value of vInstanceStride has been initialized to zero
1158 vOffsets = ADD(vOffsets, vInstanceStride);
1159
1160 #endif
1161 // Packing and component control
1162 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
1163 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
1164 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
1165
1166 // Special gather/conversion for formats without equal component sizes
1167 if (IsOddFormat((SWR_FORMAT)ied.Format))
1168 {
1169 #if USE_SIMD16_GATHERS
1170 Value *pResults[4];
1171 Value *pResults2[4];
1172 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1173 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask2, pStreamBase, vOffsets2, pResults2);
1174 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1175 ConvertFormat((SWR_FORMAT)ied.Format, pResults2);
1176
1177 for (uint32_t c = 0; c < 4; c += 1)
1178 {
1179 if (isComponentEnabled(compMask, c))
1180 {
1181 // pack adjacent pairs of SIMD8s into SIMD16s
1182 pVtxSrc2[currentVertexElement++] = JOIN_16(pResults[c], pResults2[c]);
1183
1184 if (currentVertexElement > 3)
1185 {
1186 // store SIMD16s
1187 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1188
1189 StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
1190 // reset to the next vVertexElement to output
1191 currentVertexElement = 0;
1192 }
1193 }
1194 }
1195 #else
1196 Value *pResults[4];
1197 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1198 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1199
1200 for (uint32_t c = 0; c < 4; c += 1)
1201 {
1202 if (isComponentEnabled(compMask, c))
1203 {
1204 vVertexElements[currentVertexElement++] = pResults[c];
1205 if (currentVertexElement > 3)
1206 {
1207 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1208 // reset to the next vVertexElement to output
1209 currentVertexElement = 0;
1210 }
1211 }
1212 }
1213 #endif
1214 }
1215 else if(info.type[0] == SWR_TYPE_FLOAT)
1216 {
1217 ///@todo: support 64 bit vb accesses
1218 Value *gatherSrc = VIMMED1(0.0f);
1219 #if USE_SIMD16_GATHERS
1220 Value *gatherSrc16 = VIMMED1_16(0.0f);
1221 #endif
1222
1223 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1224 "Unsupported format for standard gather fetch.");
1225
1226 // Gather components from memory to store in a simdvertex structure
1227 switch (bpc)
1228 {
1229 case 16:
1230 {
1231 #if USE_SIMD16_GATHERS
1232 Value *gatherResult[2];
1233
1234 // if we have at least one component out of x or y to fetch
1235 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1236 {
1237 gatherResult[0] = GATHERPS_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1238
1239 // e.g. result of first 8x32bit integer gather for 16bit components
1240 // 256i - 0 1 2 3 4 5 6 7
1241 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1242 //
1243 }
1244 else
1245 {
1246 gatherResult[0] = VUNDEF_I_16();
1247 }
1248
1249 // if we have at least one component out of z or w to fetch
1250 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1251 {
1252 // offset base to the next components(zw) in the vertex to gather
1253 pStreamBase = GEP(pStreamBase, C((char)4));
1254
1255 gatherResult[1] = GATHERPS_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1256
1257 // e.g. result of second 8x32bit integer gather for 16bit components
1258 // 256i - 0 1 2 3 4 5 6 7
1259 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1260 //
1261 }
1262 else
1263 {
1264 gatherResult[1] = VUNDEF_I_16();
1265 }
1266
1267 // if we have at least one component to shuffle into place
1268 if (compMask)
1269 {
1270 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1271
1272 Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE,
1273 currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
1274
1275 // Shuffle gathered components into place in simdvertex struct
1276 Shuffle16bpcGather16(args); // outputs to vVertexElements ref
1277 }
1278 #else
1279 Value *vGatherResult[2];
1280
1281 // if we have at least one component out of x or y to fetch
1282 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1283 {
1284 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1285 // e.g. result of first 8x32bit integer gather for 16bit components
1286 // 256i - 0 1 2 3 4 5 6 7
1287 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1288 //
1289 }
1290
1291 // if we have at least one component out of z or w to fetch
1292 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1293 {
1294 // offset base to the next components(zw) in the vertex to gather
1295 pStreamBase = GEP(pStreamBase, C((char)4));
1296
1297 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1298 // e.g. result of second 8x32bit integer gather for 16bit components
1299 // 256i - 0 1 2 3 4 5 6 7
1300 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1301 //
1302 }
1303
1304 // if we have at least one component to shuffle into place
1305 if (compMask)
1306 {
1307 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
1308 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1309
1310 // Shuffle gathered components into place in simdvertex struct
1311 #if USE_SIMD16_SHADERS
1312 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1313 #else
1314 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1315 #endif
1316 }
1317 #endif
1318 }
1319 break;
1320 case 32:
1321 {
1322 for (uint32_t i = 0; i < 4; i += 1)
1323 {
1324 #if USE_SIMD16_GATHERS
1325 if (isComponentEnabled(compMask, i))
1326 {
1327 // if we need to gather the component
1328 if (compCtrl[i] == StoreSrc)
1329 {
1330 // Gather a SIMD of vertices
1331 // APIs allow a 4GB range for offsets
1332 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1333 // But, we know that elements must be aligned for FETCH. :)
1334 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
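// e.g. a byte offset of 0x80000004 (> 2GB) becomes 0x40000002 after the shift; the gather
// with scale 2 then addresses the original byte. The low bit can be dropped because 32-bit
// elements are 4-byte aligned.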
1335 Value *shiftedOffsets16 = LSHR(vOffsets16, 1);
1336 pVtxSrc2[currentVertexElement++] = GATHERPS_16(gatherSrc16, pStreamBase, shiftedOffsets16, vGatherMask16, 2);
1337 }
1338 else
1339 {
1340 pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
1341 }
1342
1343 if (currentVertexElement > 3)
1344 {
1345 // store SIMD16s
1346 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1347
1348 StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
1349 // reset to the next vVertexElement to output
1350 currentVertexElement = 0;
1351 }
1352 }
1353
1354 // offset base to the next component in the vertex to gather
1355 pStreamBase = GEP(pStreamBase, C((char)4));
1356 #else
1357 if (isComponentEnabled(compMask, i))
1358 {
1359 // if we need to gather the component
1360 if (compCtrl[i] == StoreSrc)
1361 {
1362 // Gather a SIMD of vertices
1363 // APIs allow a 4GB range for offsets
1364 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1365 // But, we know that elements must be aligned for FETCH. :)
1366 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
1367 Value *vShiftedOffsets = LSHR(vOffsets, 1);
1368 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBaseGFX, vShiftedOffsets, vGatherMask, 2);
1369 }
1370 else
1371 {
1372 #if USE_SIMD16_SHADERS
1373 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1374 #else
1375 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1376 #endif
1377 }
1378
1379 if (currentVertexElement > 3)
1380 {
1381 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1382 // reset to the next vVertexElement to output
1383 currentVertexElement = 0;
1384 }
1385 }
1386
1387 // offset base to the next component in the vertex to gather
1388 pStreamBase = GEP(pStreamBase, C((char)4));
1389 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
1390 #endif
1391 }
1392 }
1393 break;
1394 case 64:
1395 {
1396 for (uint32_t i = 0; i < 4; i += 1)
1397 {
1398 #if USE_SIMD16_GATHERS
1399 if (isComponentEnabled(compMask, i))
1400 {
1401 // if we need to gather the component
1402 if (compCtrl[i] == StoreSrc)
1403 {
1404 Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1405 Value *vMaskLo2 = VSHUFFLE(vGatherMask2, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1406 Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1407 Value *vMaskHi2 = VSHUFFLE(vGatherMask2, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1408
1409 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1410 Value *vOffsetsLo2 = VEXTRACTI128(vOffsets2, C(0));
1411 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1412 Value *vOffsetsHi2 = VEXTRACTI128(vOffsets2, C(1));
1413
1414 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1415
1416 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
1417 Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2);
1418 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
1419 Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2);
1420
1421 pGatherLo = VCVTPD2PS(pGatherLo);
1422 pGatherLo2 = VCVTPD2PS(pGatherLo2);
1423 pGatherHi = VCVTPD2PS(pGatherHi);
1424 pGatherHi2 = VCVTPD2PS(pGatherHi2);
1425
1426 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1427 Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1428
1429 // pack adjacent pairs of SIMD8s into SIMD16s
1430 pVtxSrc2[currentVertexElement++] = JOIN_16(pGather, pGather2);
1431 }
1432 else
1433 {
1434 pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
1435 }
1436
1437 if (currentVertexElement > 3)
1438 {
1439 // store SIMD16s
1440 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1441
1442 StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
1443 // reset to the next vVertexElement to output
1444 currentVertexElement = 0;
1445 }
1446 }
1447
1448 // offset base to the next component in the vertex to gather
1449 pStreamBase = GEP(pStreamBase, C((char)8));
1450 #else
1451 if (isComponentEnabled(compMask, i))
1452 {
1453 // if we need to gather the component
1454 if (compCtrl[i] == StoreSrc)
1455 {
1456 Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
1457 Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
1458
1459 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1460 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1461
1462 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1463
1464 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
1465 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
1466
1467 pGatherLo = VCVTPD2PS(pGatherLo);
1468 pGatherHi = VCVTPD2PS(pGatherHi);
1469
1470 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
1471
1472 vVertexElements[currentVertexElement++] = pGather;
1473 }
1474 else
1475 {
1476 #if USE_SIMD16_SHADERS
1477 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1478 #else
1479 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1480 #endif
1481 }
1482
1483 if (currentVertexElement > 3)
1484 {
1485 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1486 // reset to the next vVertexElement to output
1487 currentVertexElement = 0;
1488 }
1489 }
1490
1491 // offset base to the next component in the vertex to gather
1492 pStreamBase = GEP(pStreamBase, C((char)8));
1493 #endif
1494 }
1495 }
1496 break;
1497 default:
1498 SWR_INVALID("Tried to fetch invalid FP format");
1499 break;
1500 }
1501 }
1502 else
1503 {
1504 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
1505 ConversionType conversionType = CONVERT_NONE;
1506
1507 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1508 "Unsupported format for standard gather fetch.");
1509
1510 switch(info.type[0])
1511 {
1512 case SWR_TYPE_UNORM:
1513 conversionType = CONVERT_NORMALIZED;
1514 case SWR_TYPE_UINT:
1515 extendCastType = Instruction::CastOps::ZExt;
1516 break;
1517 case SWR_TYPE_SNORM:
1518 conversionType = CONVERT_NORMALIZED;
1519 case SWR_TYPE_SINT:
1520 extendCastType = Instruction::CastOps::SExt;
1521 break;
1522 case SWR_TYPE_USCALED:
1523 conversionType = CONVERT_USCALED;
1524 extendCastType = Instruction::CastOps::UIToFP;
1525 break;
1526 case SWR_TYPE_SSCALED:
1527 conversionType = CONVERT_SSCALED;
1528 extendCastType = Instruction::CastOps::SIToFP;
1529 break;
1530 case SWR_TYPE_SFIXED:
1531 conversionType = CONVERT_SFIXED;
1532 extendCastType = Instruction::CastOps::SExt;
1533 break;
1534 default:
1535 break;
1536 }
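// Note: the fallthroughs above are intentional: UNORM shares UINT's zero extension and
// SNORM shares SINT's sign extension; the actual normalization/scaling is applied later,
// driven by conversionType.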
1537
1538 // value substituted when component of gather is masked
1539 Value* gatherSrc = VIMMED1(0);
1540 #if USE_SIMD16_GATHERS
1541 Value *gatherSrc16 = VIMMED1_16(0);
1542 #endif
1543
1544 // Gather components from memory to store in a simdvertex structure
1545 switch (bpc)
1546 {
1547 case 8:
1548 {
1549 // if we have at least one component to fetch
1550 if (compMask)
1551 {
1552 #if USE_SIMD16_GATHERS
1553 Value *gatherResult = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1554
1555 // e.g. result of an 8x32bit integer gather for 8bit components
1556 // 256i - 0 1 2 3 4 5 6 7
1557 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1558
1559 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1560
1561 Shuffle8bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
1562 currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2, info.swizzle);
1563
1564 // Shuffle gathered components into place in simdvertex struct
1565 Shuffle8bpcGatherd16(args); // outputs to vVertexElements ref
1566 #else
1567 Value *vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1568 // e.g. result of an 8x32bit integer gather for 8bit components
1569 // 256i - 0 1 2 3 4 5 6 7
1570 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1571
1572 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1573 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1574
1575 // Shuffle gathered components into place in simdvertex struct
1576 #if USE_SIMD16_SHADERS
1577 Shuffle8bpcGatherd(args, useVertexID2); // outputs to vVertexElements ref
1578 #else
1579 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
1580 #endif
1581 #endif
1582 }
1583 }
1584 break;
1585 case 16:
1586 {
1587 #if USE_SIMD16_GATHERS
1588 Value *gatherResult[2];
1589
1590 // if we have at least one component out of x or y to fetch
1591 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1592 {
1593 gatherResult[0] = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1594
1595 // e.g. result of first 8x32bit integer gather for 16bit components
1596 // 256i - 0 1 2 3 4 5 6 7
1597 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1598 //
1599 }
1600 else
1601 {
1602 gatherResult[0] = VUNDEF_I_16();
1603 }
1604
1605 // if we have at least one component out of z or w to fetch
1606 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1607 {
1608 // offset base to the next components(zw) in the vertex to gather
1609 pStreamBase = GEP(pStreamBase, C((char)4));
1610
1611 gatherResult[1] = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1612
1613 // e.g. result of second 8x32bit integer gather for 16bit components
1614 // 256i - 0 1 2 3 4 5 6 7
1615 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1616 //
1617 }
1618 else
1619 {
1620 gatherResult[1] = VUNDEF_I_16();
1621 }
1622
1623 // if we have at least one component to shuffle into place
1624 if (compMask)
1625 {
1626 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1627
1628 Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
1629 currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
1630
1631 // Shuffle gathered components into place in simdvertex struct
1632 Shuffle16bpcGather16(args); // outputs to vVertexElements ref
1633 }
1634 #else
1635 Value *vGatherResult[2];
1636
1637 // if we have at least one component out of x or y to fetch
1638 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1639 {
1640 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1641 // e.g. result of first 8x32bit integer gather for 16bit components
1642 // 256i - 0 1 2 3 4 5 6 7
1643 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1644 //
1645 }
1646
1647 // if we have at least one component out of z or w to fetch
1648 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1649 {
1650 // offset base to the next components(zw) in the vertex to gather
1651 pStreamBase = GEP(pStreamBase, C((char)4));
1652
1653 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1654 // e.g. result of second 8x32bit integer gather for 16bit components
1655 // 256i - 0 1 2 3 4 5 6 7
1656 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1657 //
1658 }
1659
1660 // if we have at least one component to shuffle into place
1661 if (compMask)
1662 {
1663 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1664 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1665
1666 // Shuffle gathered components into place in simdvertex struct
1667 #if USE_SIMD16_SHADERS
1668 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1669 #else
1670 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1671 #endif
1672 }
1673 #endif
1674 }
1675 break;
1676 case 32:
1677 {
1678 // Gather components from memory and place them in the simdvertex struct
1679 for (uint32_t i = 0; i < 4; i++)
1680 {
1681 if (isComponentEnabled(compMask, i))
1682 {
1683 // if we need to gather the component
1684 if (compCtrl[i] == StoreSrc)
1685 {
1686 #if USE_SIMD16_GATHERS
1687 Value *pGather = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1688
1689 if (conversionType == CONVERT_USCALED)
1690 {
1691 pGather = UI_TO_FP(pGather, mSimd16FP32Ty);
1692 }
1693 else if (conversionType == CONVERT_SSCALED)
1694 {
1695 pGather = SI_TO_FP(pGather, mSimd16FP32Ty);
1696 }
1697 else if (conversionType == CONVERT_SFIXED)
1698 {
1699 pGather = FMUL(SI_TO_FP(pGather, mSimd16FP32Ty), VBROADCAST_16(C(1 / 65536.0f)));
1700 }
1701
1702 pVtxSrc2[currentVertexElement++] = pGather;
1703
1704 // e.g. result of a single 8x32bit integer gather for 32bit components
1705 // 256i - 0 1 2 3 4 5 6 7
1706 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1707 #else
1708 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1709
1710 if (conversionType == CONVERT_USCALED)
1711 {
1712 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1713 }
1714 else if (conversionType == CONVERT_SSCALED)
1715 {
1716 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1717 }
1718 else if (conversionType == CONVERT_SFIXED)
1719 {
1720 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
1721 }
1722
1723 vVertexElements[currentVertexElement++] = pGather;
1724
1725 // e.g. result of a single 8x32bit integer gather for 32bit components
1726 // 256i - 0 1 2 3 4 5 6 7
1727 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1728 #endif
1729 }
1730 else
1731 {
1732 #if USE_SIMD16_GATHERS
1733 pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
1734 #else
1735 #if USE_SIMD16_SHADERS
1736 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1737 #else
1738 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1739 #endif
1740 #endif
1741 }
1742
1743 if (currentVertexElement > 3)
1744 {
1745 #if USE_SIMD16_GATHERS
1746 // store SIMD16s
1747 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1748
1749 StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
1750 #else
1751 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1752 #endif
1753
1754 // reset to the next vVertexElement to output
1755 currentVertexElement = 0;
1756 }
1757
1758 }
1759
1760 // offset base to the next component in the vertex to gather
1761 pStreamBase = GEP(pStreamBase, C((char)4));
1762 }
1763 }
1764 break;
1765 }
1766 }
1767 }
1768
1769 // if we have a partially filled vVertexElement struct, output it
1770 if (currentVertexElement > 0)
1771 {
1772 #if USE_SIMD16_GATHERS
1773 // store SIMD16s
1774 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1775
1776 StoreVertexElements16(pVtxOut2, outputElt++, currentVertexElement, pVtxSrc2);
1777 #else
1778 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1779 #endif
1780 }
1781 }
1782
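// Editor's note: a minimal scalar sketch of the 32-bit component conversions
// performed in the case-32 gather path above (illustration only, not built;
// assumes <cstdint> for the fixed-width types).
#if 0
static float ConvertScaled32(int32_t raw, ConversionType conversion)
{
    // CONVERT_NONE components are stored with their raw integer bits and
    // never take this path.
    switch (conversion)
    {
    case CONVERT_USCALED: return float(uint32_t(raw));            // UI_TO_FP
    case CONVERT_SSCALED: return float(raw);                      // SI_TO_FP
    case CONVERT_SFIXED:  return float(raw) * (1.0f / 65536.0f);  // 16.16 fixed point
    default:              return 0.0f;
    }
}
#endif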
1783 //////////////////////////////////////////////////////////////////////////
1784 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1785 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1786 /// support
1787 /// @param pIndices - pointer to 8 bit indices
1788 /// @param pLastIndex - pointer to last valid index
1789 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1790 {
1791 // can fit 4 8-bit integers per vWidth lane
1792 Value* vIndices = VUNDEF_I();
1793
1794 // store 0 index on stack to be used to conditionally load from if index address is OOB
1795 Value* pZeroIndex = ALLOCA(mInt8Ty);
1796 STORE(C((uint8_t)0), pZeroIndex);
1797
1798 // Load a SIMD of index pointers
1799 for(int64_t lane = 0; lane < mVWidth; lane++)
1800 {
1801 // Calculate the address of the requested index
1802 Value *pIndex = GEP(pIndices, C(lane));
1803
1804 // check if the address is less than the max index,
1805 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1806
1807 // if valid, load the index. if not, load 0 from the stack
1808 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1809 Value *index = LOAD(pValid, "valid index");
1810
1811 // zero extend the index to 32 bits and insert it into the correct simd lane
1812 index = Z_EXT(index, mInt32Ty);
1813 vIndices = VINSERT(vIndices, index, lane);
1814 }
1815 return vIndices;
1816 }
1817
1818 //////////////////////////////////////////////////////////////////////////
1819 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1820 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1821 /// support
1822 /// @param pIndices - pointer to 16 bit indices
1823 /// @param pLastIndex - pointer to last valid index
1824 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1825 {
1826 // can fit 2 16 bit integers per vWidth lane
1827 Value* vIndices = VUNDEF_I();
1828
1829 // store 0 index on stack to be used to conditionally load from if index address is OOB
1830 Value* pZeroIndex = ALLOCA(mInt16Ty);
1831 STORE(C((uint16_t)0), pZeroIndex);
1832
1833 // Load a SIMD of index pointers
1834 for(int64_t lane = 0; lane < mVWidth; lane++)
1835 {
1836 // Calculate the address of the requested index
1837 Value *pIndex = GEP(pIndices, C(lane));
1838
1839 // check if the address is less than the max index,
1840 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1841
1842 // if valid, load the index. if not, load 0 from the stack
1843 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1844 Value *index = LOAD(pValid, "valid index");
1845
1846 // zero extend the index to 32 bits and insert it into the correct simd lane
1847 index = Z_EXT(index, mInt32Ty);
1848 vIndices = VINSERT(vIndices, index, lane);
1849 }
1850 return vIndices;
1851 }
1852
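// Editor's note: the two loops above are the SIMD analogue of this scalar
// clamp (illustration only, not built; IndexT stands in for the uint8_t /
// uint16_t index types and <cstdint> is assumed).
#if 0
template <typename IndexT>
static uint32_t LoadIndexOrZero(const IndexT* pIndex, const IndexT* pLastIndex)
{
    // Out-of-bounds index addresses read 0 instead of touching memory past
    // the end of the buffer, mirroring SELECT(mask, pIndex, pZeroIndex) above.
    const IndexT index = (pIndex < pLastIndex) ? *pIndex : IndexT(0);

    // Zero-extend to 32 bits, as Z_EXT does for each SIMD lane.
    return static_cast<uint32_t>(index);
}
#endif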
1853 //////////////////////////////////////////////////////////////////////////
1854 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1855 /// @param pIndices - pointer to 32 bit indices
1856 /// @param pLastIndex - pointer to last valid index
1857 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1858 {
1859 DataLayout dL(JM()->mpCurrentModule);
1860 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1861 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1862 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1863
1864 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1865 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1866 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1867 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1868
1869 // create a vector of index counts from the base index ptr passed into the fetch
1870 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1871 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1872
1873 // compare index count to the max valid index
1874 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1875 // vIndexOffsets 0 1 2 3 4 5 6 7
1876 // ------------------------------
1877 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1878 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1879 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1880 Value* vIndexMask = VPCMPGTD(vMaxIndex, vIndexOffsets);
1881
1882 // VMASKLOAD takes an *i8 src pointer
1883 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1884
1885 // Load the indices; OOB loads 0
1886 return MASKLOADD(pIndices,vIndexMask);
1887 }
1888
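// Editor's note: a scalar model of the masked-load guard built above
// (illustration only, not built; assumes <cstdint>/<cstddef> and the 8-wide
// SIMD this path targets).
#if 0
static void BuildIndexLoadMask(const uint32_t* pIndices,
                               const uint32_t* pLastIndex,
                               int32_t (&laneMask)[8])
{
    // (endPtr - curPtr) / sizeof(index) == indices still left in the buffer
    const std::ptrdiff_t numIndicesLeft = pLastIndex - pIndices;

    for (int lane = 0; lane < 8; ++lane)
    {
        // lanes whose offset runs past the end are masked off, so the
        // VPCMPGTD + MASKLOADD pair above loads 0 for them
        laneMask[lane] = (lane < numIndicesLeft) ? -1 : 0;
    }
}
#endif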
1889 //////////////////////////////////////////////////////////////////////////
1890 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1891 /// denormalizes if needed, converts to F32 if needed, and positions in
1892 /// the proper SIMD rows to be output to the simdvertex structure
1893 /// @param args: (tuple of args, listed below)
1894 /// @param vGatherResult - 8 gathered 8bpc vertices
1895 /// @param pVtxOut - base pointer to output simdvertex struct
1896 /// @param extendType - sign extend or zero extend
1897 /// @param conversionType - conversion to apply to the raw components (none, normalized, scaled)
1898 /// @param currentVertexElement - reference to the current vVertexElement
1899 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1900 /// @param compMask - component packing mask
1901 /// @param compCtrl - component control val
1902 /// @param vVertexElements[4] - vertex components to output
1903 /// @param swizzle[4] - component swizzle location
1904 #if USE_SIMD16_GATHERS
1905 void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args)
1906 {
1907 // Unpack tuple args
1908 Value*& vGatherResult = std::get<0>(args);
1909 Value* pVtxOut = std::get<1>(args);
1910 const Instruction::CastOps extendType = std::get<2>(args);
1911 const ConversionType conversionType = std::get<3>(args);
1912 uint32_t &currentVertexElement = std::get<4>(args);
1913 uint32_t &outputElt = std::get<5>(args);
1914 const ComponentEnable compMask = std::get<6>(args);
1915 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1916 Value* (&vVertexElements)[4] = std::get<8>(args);
1917 const uint32_t(&swizzle)[4] = std::get<9>(args);
1918
1919 // cast types
1920 Type *vGatherTy = mSimdInt32Ty;
1921 Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1922
1923 // have to do extra work for sign extending
1924 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
1925 {
1926 Type *v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
1927 Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1928
1929 // shuffle mask, including any swizzling
1930 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1931 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1932 Value *vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12),
1933 char(y), char(y + 4), char(y + 8), char(y + 12),
1934 char(z), char(z + 4), char(z + 8), char(z + 12),
1935 char(w), char(w + 4), char(w + 8), char(w + 12),
1936 char(x), char(x + 4), char(x + 8), char(x + 12),
1937 char(y), char(y + 4), char(y + 8), char(y + 12),
1938 char(z), char(z + 4), char(z + 8), char(z + 12),
1939 char(w), char(w + 4), char(w + 8), char(w + 12) });
1940
1941 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
1942
1943 Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1944 Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1945
1946 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1947 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1948
1949 // after pshufb: group components together in each 128bit lane
1950 // 256i - 0 1 2 3 4 5 6 7
1951 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1952
1953 Value *vi128XY_lo = nullptr;
1954 Value *vi128XY_hi = nullptr;
1955 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1956 {
1957 vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
1958 vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
1959
1960 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1961 // 256i - 0 1 2 3 4 5 6 7
1962 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1963 }
1964
1965 // do the same for zw components
1966 Value *vi128ZW_lo = nullptr;
1967 Value *vi128ZW_hi = nullptr;
1968 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1969 {
1970 vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
1971 vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
1972 }
1973
1974 // init denormalize variables if needed
1975 Instruction::CastOps fpCast;
1976 Value *conversionFactor;
1977
1978 switch (conversionType)
1979 {
1980 case CONVERT_NORMALIZED:
1981 fpCast = Instruction::CastOps::SIToFP;
1982 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1983 break;
1984 case CONVERT_SSCALED:
1985 fpCast = Instruction::CastOps::SIToFP;
1986 conversionFactor = VIMMED1((float)(1.0));
1987 break;
1988 case CONVERT_USCALED:
1989 SWR_INVALID("Type should not be sign extended!");
1990 conversionFactor = nullptr;
1991 break;
1992 default:
1993 SWR_ASSERT(conversionType == CONVERT_NONE);
1994 conversionFactor = nullptr;
1995 break;
1996 }
1997
1998 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1999 for (uint32_t i = 0; i < 4; i++)
2000 {
2001 if (isComponentEnabled(compMask, i))
2002 {
2003 if (compCtrl[i] == ComponentControl::StoreSrc)
2004 {
2005 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2006 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2007 // if x or y, use vi128XY permute result, else use vi128ZW
2008 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
2009 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
2010
2011 // sign extend
2012 Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
2013 Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
2014
2015 // denormalize if needed
2016 if (conversionType != CONVERT_NONE)
2017 {
2018 temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2019 temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2020 }
2021
2022 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
2023
2024 currentVertexElement += 1;
2025 }
2026 else
2027 {
2028 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
2029 }
2030
2031 if (currentVertexElement > 3)
2032 {
2033 StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
2034 // reset to the next vVertexElement to output
2035 currentVertexElement = 0;
2036 }
2037 }
2038 }
2039 }
2040 // else zero extend
2041 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2042 {
2043 // init denormalize variables if needed
2044 Instruction::CastOps fpCast;
2045 Value *conversionFactor;
2046
2047 switch (conversionType)
2048 {
2049 case CONVERT_NORMALIZED:
2050 fpCast = Instruction::CastOps::UIToFP;
2051 conversionFactor = VIMMED1((float)(1.0 / 255.0));
2052 break;
2053 case CONVERT_USCALED:
2054 fpCast = Instruction::CastOps::UIToFP;
2055 conversionFactor = VIMMED1((float)(1.0));
2056 break;
2057 case CONVERT_SSCALED:
2058 SWR_INVALID("Type should not be zero extended!");
2059 conversionFactor = nullptr;
2060 break;
2061 default:
2062 SWR_ASSERT(conversionType == CONVERT_NONE);
2063 conversionFactor = nullptr;
2064 break;
2065 }
2066
2067 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
2068 for (uint32_t i = 0; i < 4; i++)
2069 {
2070 if (isComponentEnabled(compMask, i))
2071 {
2072 if (compCtrl[i] == ComponentControl::StoreSrc)
2073 {
2074 // pshufb masks for each component
2075 Value *vConstMask;
2076 switch (swizzle[i])
2077 {
2078 case 0:
2079 // x shuffle mask
2080 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
2081 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
2082 break;
2083 case 1:
2084 // y shuffle mask
2085 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
2086 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
2087 break;
2088 case 2:
2089 // z shuffle mask
2090 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
2091 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
2092 break;
2093 case 3:
2094 // w shuffle mask
2095 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
2096 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
2097 break;
2098 default:
2099 vConstMask = nullptr;
2100 break;
2101 }
2102
2103 Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
2104 Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
2105
2106 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2107 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2108
2109 // after pshufb for x channel
2110 // 256i - 0 1 2 3 4 5 6 7
2111 // x000 x000 x000 x000 x000 x000 x000 x000
2112
2113 // denormalize if needed
2114 if (conversionType != CONVERT_NONE)
2115 {
2116 temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2117 temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2118 }
2119
2120 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
2121
2122 currentVertexElement += 1;
2123 }
2124 else
2125 {
2126 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
2127 }
2128
2129 if (currentVertexElement > 3)
2130 {
2131 StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
2132 // reset to the next vVertexElement to output
2133 currentVertexElement = 0;
2134 }
2135 }
2136 }
2137 }
2138 else
2139 {
2140 SWR_INVALID("Unsupported conversion type");
2141 }
2142 }
2143
2144 #else
2145 #if USE_SIMD16_SHADERS
2146 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2)
2147 #else
2148 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
2149 #endif
2150 {
2151 // Unpack tuple args
2152 Value*& vGatherResult = std::get<0>(args);
2153 Value* pVtxOut = std::get<1>(args);
2154 const Instruction::CastOps extendType = std::get<2>(args);
2155 const ConversionType conversionType = std::get<3>(args);
2156 uint32_t &currentVertexElement = std::get<4>(args);
2157 uint32_t &outputElt = std::get<5>(args);
2158 const ComponentEnable compMask = std::get<6>(args);
2159 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2160 Value* (&vVertexElements)[4] = std::get<8>(args);
2161 const uint32_t(&swizzle)[4] = std::get<9>(args);
2162
2163 // cast types
2164 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2165
2166 for (uint32_t i = 0; i < 4; i++)
2167 {
2168 if (!isComponentEnabled(compMask, i))
2169 continue;
2170
2171 if (compCtrl[i] == ComponentControl::StoreSrc)
2172 {
2173 std::vector<uint32_t> vShuffleMasks[4] = {
2174 { 0, 4, 8, 12, 16, 20, 24, 28 }, // x
2175 { 1, 5, 9, 13, 17, 21, 25, 29 }, // y
2176 { 2, 6, 10, 14, 18, 22, 26, 30 }, // z
2177 { 3, 7, 11, 15, 19, 23, 27, 31 }, // w
2178 };
2179
2180 Value *val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
2181 UndefValue::get(v32x8Ty),
2182 vShuffleMasks[swizzle[i]]);
2183
2184 if ((extendType == Instruction::CastOps::SExt) ||
2185 (extendType == Instruction::CastOps::SIToFP)) {
2186 switch (conversionType)
2187 {
2188 case CONVERT_NORMALIZED:
2189 val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
2190 break;
2191 case CONVERT_SSCALED:
2192 val = SI_TO_FP(val, mSimdFP32Ty);
2193 break;
2194 case CONVERT_USCALED:
2195 SWR_INVALID("Type should not be sign extended!");
2196 break;
2197 default:
2198 SWR_ASSERT(conversionType == CONVERT_NONE);
2199 val = S_EXT(val, mSimdInt32Ty);
2200 break;
2201 }
2202 }
2203 else if ((extendType == Instruction::CastOps::ZExt) ||
2204 (extendType == Instruction::CastOps::UIToFP)) {
2205 switch (conversionType)
2206 {
2207 case CONVERT_NORMALIZED:
2208 val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
2209 break;
2210 case CONVERT_SSCALED:
2211 SWR_INVALID("Type should not be zero extended!");
2212 break;
2213 case CONVERT_USCALED:
2214 val = UI_TO_FP(val, mSimdFP32Ty);
2215 break;
2216 default:
2217 SWR_ASSERT(conversionType == CONVERT_NONE);
2218 val = Z_EXT(val, mSimdInt32Ty);
2219 break;
2220 }
2221 }
2222 else
2223 {
2224 SWR_INVALID("Unsupported conversion type");
2225 }
2226
2227 vVertexElements[currentVertexElement++] = val;
2228 }
2229 else
2230 {
2231 #if USE_SIMD16_SHADERS
2232 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2233 #else
2234 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2235 #endif
2236 }
2237
2238 if (currentVertexElement > 3)
2239 {
2240 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2241 // reset to the next vVertexElement to output
2242 currentVertexElement = 0;
2243 }
2244 }
2245 }
2246
2247 #endif
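// Editor's note: per-component scalar model of the 8bpc normalization that
// both shuffle variants above apply after the PSHUFB/VSHUFFLE swizzle
// (illustration only, not built; assumes <cstdint>).
#if 0
static float Normalize8bpc(uint8_t raw, bool isSigned)
{
    // SNORM: sign extend (SExt / PMOVSXBD) and scale by 1/127.
    // UNORM: zero extend (ZExt) and scale by 1/255.
    // These match the conversionFactor constants above; SCALED formats skip
    // the scale and CONVERT_NONE keeps the extended integer bits.
    return isSigned ? float(int8_t(raw)) * (1.0f / 127.0f)
                    : float(raw)         * (1.0f / 255.0f);
}
#endif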
2248 //////////////////////////////////////////////////////////////////////////
2249 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
2250 /// denormalizes if needed, converts to F32 if needed, and positions in
2251 /// the proper SIMD rows to be output to the simdvertex structure
2252 /// @param args: (tuple of args, listed below)
2253 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
2254 /// @param pVtxOut - base pointer to output simdvertex struct
2255 /// @param extendType - sign extend or zero extend
2256 /// @param conversionType - conversion to apply to the raw components (none, normalized, scaled)
2257 /// @param currentVertexElement - reference to the current vVertexElement
2258 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
2259 /// @param compMask - component packing mask
2260 /// @param compCtrl - component control val
2261 /// @param vVertexElements[4] - vertex components to output
2262 #if USE_SIMD16_GATHERS
2263 void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
2264 {
2265 // Unpack tuple args
2266 Value* (&vGatherResult)[2] = std::get<0>(args);
2267 Value* pVtxOut = std::get<1>(args);
2268 const Instruction::CastOps extendType = std::get<2>(args);
2269 const ConversionType conversionType = std::get<3>(args);
2270 uint32_t &currentVertexElement = std::get<4>(args);
2271 uint32_t &outputElt = std::get<5>(args);
2272 const ComponentEnable compMask = std::get<6>(args);
2273 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2274 Value* (&vVertexElements)[4] = std::get<8>(args);
2275
2276 // cast types
2277 Type *vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
2278 Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2279
2280 // have to do extra work for sign extending
2281 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
2282 {
2283 // is this half-precision (FP16) float data?
2284 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
2285
2286 Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
2287 Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2288
2289 // shuffle mask
2290 Value *vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2291 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
2292 Value *vi128XY_lo = nullptr;
2293 Value *vi128XY_hi = nullptr;
2294 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
2295 {
2296 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
2297
2298 Value *vGatherResult_lo = EXTRACT_16(vGatherResult[0], 0);
2299 Value *vGatherResult_hi = EXTRACT_16(vGatherResult[0], 1);
2300
2301 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2302 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2303
2304 // after pshufb: group components together in each 128bit lane
2305 // 256i - 0 1 2 3 4 5 6 7
2306 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
2307
2308 vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2309 vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2310
2311 // after PERMD: move and pack xy components into each 128bit lane
2312 // 256i - 0 1 2 3 4 5 6 7
2313 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
2314 }
2315
2316 // do the same for zw components
2317 Value *vi128ZW_lo = nullptr;
2318 Value *vi128ZW_hi = nullptr;
2319 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
2320 {
2321 Value *vGatherResult_lo = EXTRACT_16(vGatherResult[1], 0);
2322 Value *vGatherResult_hi = EXTRACT_16(vGatherResult[1], 1);
2323
2324 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2325 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2326
2327 vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2328 vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2329 }
2330
2331 // init denormalize variables if needed
2332 Instruction::CastOps IntToFpCast;
2333 Value *conversionFactor;
2334
2335 switch (conversionType)
2336 {
2337 case CONVERT_NORMALIZED:
2338 IntToFpCast = Instruction::CastOps::SIToFP;
2339 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2340 break;
2341 case CONVERT_SSCALED:
2342 IntToFpCast = Instruction::CastOps::SIToFP;
2343 conversionFactor = VIMMED1((float)(1.0));
2344 break;
2345 case CONVERT_USCALED:
2346 SWR_INVALID("Type should not be sign extended!");
2347 conversionFactor = nullptr;
2348 break;
2349 default:
2350 SWR_ASSERT(conversionType == CONVERT_NONE);
2351 conversionFactor = nullptr;
2352 break;
2353 }
2354
2355 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2356 for (uint32_t i = 0; i < 4; i++)
2357 {
2358 if (isComponentEnabled(compMask, i))
2359 {
2360 if (compCtrl[i] == ComponentControl::StoreSrc)
2361 {
2362 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2363 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2364 // if x or y, use vi128XY permute result, else use vi128ZW
2365 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
2366 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
2367
2368 if (bFP)
2369 {
2370 // extract 128 bit lanes and widen each half-float component to full float
2371 Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
2372 Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
2373
2374 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
2375 }
2376 else
2377 {
2378 // extract 128 bit lanes to sign extend each component
2379 Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
2380 Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
2381
2382 // denormalize if needed
2383 if (conversionType != CONVERT_NONE)
2384 {
2385 temp_lo = FMUL(CAST(IntToFpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2386 temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2387 }
2388
2389 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
2390 }
2391
2392 currentVertexElement += 1;
2393 }
2394 else
2395 {
2396 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
2397 }
2398
2399 if (currentVertexElement > 3)
2400 {
2401 StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
2402 // reset to the next vVertexElement to output
2403 currentVertexElement = 0;
2404 }
2405 }
2406 }
2407 }
2408 // else zero extend
2409 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2410 {
2411 // pshufb masks for each component
2412 Value *vConstMask[2];
2413
2414 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
2415 {
2416 // x/z shuffle mask
2417 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2418 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
2419 }
2420
2421 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
2422 {
2423 // y/w shuffle mask
2424 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2425 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
2426 }
2427
2428 // init denormalize variables if needed
2429 Instruction::CastOps fpCast;
2430 Value* conversionFactor;
2431
2432 switch (conversionType)
2433 {
2434 case CONVERT_NORMALIZED:
2435 fpCast = Instruction::CastOps::UIToFP;
2436 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2437 break;
2438 case CONVERT_USCALED:
2439 fpCast = Instruction::CastOps::UIToFP;
2440 conversionFactor = VIMMED1((float)(1.0f));
2441 break;
2442 case CONVERT_SSCALED:
2443 SWR_INVALID("Type should not be zero extended!");
2444 conversionFactor = nullptr;
2445 break;
2446 default:
2447 SWR_ASSERT(conversionType == CONVERT_NONE);
2448 conversionFactor = nullptr;
2449 break;
2450 }
2451
2452 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2453 for (uint32_t i = 0; i < 4; i++)
2454 {
2455 if (isComponentEnabled(compMask, i))
2456 {
2457 if (compCtrl[i] == ComponentControl::StoreSrc)
2458 {
2459 // select correct constMask for x/z or y/w pshufb
2460 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2461 // if x or y, use the first (xy) gather result, else the second (zw) gather result
2462 uint32_t selectedGather = (i < 2) ? 0 : 1;
2463
2464 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
2465
2466 Value *vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
2467 Value *vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
2468
2469 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2470 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2471
2472 // after pshufb mask for x channel; z uses the same shuffle from the second gather
2473 // 256i - 0 1 2 3 4 5 6 7
2474 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2475
2476 // denormalize if needed
2477 if (conversionType != CONVERT_NONE)
2478 {
2479 temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2480 temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2481 }
2482
2483 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
2484
2485 currentVertexElement += 1;
2486 }
2487 else
2488 {
2489 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
2490 }
2491
2492 if (currentVertexElement > 3)
2493 {
2494 StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
2495 // reset to the next vVertexElement to output
2496 currentVertexElement = 0;
2497 }
2498 }
2499 }
2500 }
2501 else
2502 {
2503 SWR_INVALID("Unsupported conversion type");
2504 }
2505 }
2506
2507 #else
2508 #if USE_SIMD16_SHADERS
2509 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2)
2510 #else
2511 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
2512 #endif
2513 {
2514 // Unpack tuple args
2515 Value* (&vGatherResult)[2] = std::get<0>(args);
2516 Value* pVtxOut = std::get<1>(args);
2517 const Instruction::CastOps extendType = std::get<2>(args);
2518 const ConversionType conversionType = std::get<3>(args);
2519 uint32_t &currentVertexElement = std::get<4>(args);
2520 uint32_t &outputElt = std::get<5>(args);
2521 const ComponentEnable compMask = std::get<6>(args);
2522 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2523 Value* (&vVertexElements)[4] = std::get<8>(args);
2524
2525 // cast types
2526 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
2527 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2528
2529 // have to do extra work for sign extending
2530 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) ||
2531 (extendType == Instruction::CastOps::FPExt))
2532 {
2533 // is this half-precision (FP16) float data?
2534 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
2535
2536 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
2537 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2538
2539 // shuffle mask
2540 Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2541 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
2542 Value* vi128XY = nullptr;
2543 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) {
2544 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
2545 // after pshufb: group components together in each 128bit lane
2546 // 256i - 0 1 2 3 4 5 6 7
2547 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
2548
2549 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2550 // after PERMD: move and pack xy components into each 128bit lane
2551 // 256i - 0 1 2 3 4 5 6 7
2552 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
2553 }
2554
2555 // do the same for zw components
2556 Value* vi128ZW = nullptr;
2557 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) {
2558 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
2559 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2560 }
2561
2562 // init denormalize variables if needed
2563 Instruction::CastOps IntToFpCast;
2564 Value* conversionFactor;
2565
2566 switch (conversionType)
2567 {
2568 case CONVERT_NORMALIZED:
2569 IntToFpCast = Instruction::CastOps::SIToFP;
2570 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2571 break;
2572 case CONVERT_SSCALED:
2573 IntToFpCast = Instruction::CastOps::SIToFP;
2574 conversionFactor = VIMMED1((float)(1.0));
2575 break;
2576 case CONVERT_USCALED:
2577 SWR_INVALID("Type should not be sign extended!");
2578 conversionFactor = nullptr;
2579 break;
2580 default:
2581 SWR_ASSERT(conversionType == CONVERT_NONE);
2582 conversionFactor = nullptr;
2583 break;
2584 }
2585
2586 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2587 for (uint32_t i = 0; i < 4; i++)
2588 {
2589 if (isComponentEnabled(compMask, i))
2590 {
2591 if (compCtrl[i] == ComponentControl::StoreSrc)
2592 {
2593 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2594 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2595 // if x or y, use vi128XY permute result, else use vi128ZW
2596 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
2597
2598 if (bFP) {
2599 // extract 128 bit lanes and widen each half-float component to full float
2600 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2601 }
2602 else {
2603 // extract 128 bit lanes to sign extend each component
2604 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2605
2606 // denormalize if needed
2607 if (conversionType != CONVERT_NONE) {
2608 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2609 }
2610 }
2611 currentVertexElement++;
2612 }
2613 else
2614 {
2615 #if USE_SIMD16_SHADERS
2616 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2617 #else
2618 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2619 #endif
2620 }
2621
2622 if (currentVertexElement > 3)
2623 {
2624 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2625 // reset to the next vVertexElement to output
2626 currentVertexElement = 0;
2627 }
2628 }
2629 }
2630 }
2631 // else zero extend
2632 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2633 {
2634 // pshufb masks for each component
2635 Value* vConstMask[2];
2636 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) {
2637 // x/z shuffle mask
2638 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2639 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
2640 }
2641
2642 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) {
2643 // y/w shuffle mask
2644 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2645 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
2646 }
2647
2648 // init denormalize variables if needed
2649 Instruction::CastOps fpCast;
2650 Value* conversionFactor;
2651
2652 switch (conversionType)
2653 {
2654 case CONVERT_NORMALIZED:
2655 fpCast = Instruction::CastOps::UIToFP;
2656 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2657 break;
2658 case CONVERT_USCALED:
2659 fpCast = Instruction::CastOps::UIToFP;
2660 conversionFactor = VIMMED1((float)(1.0f));
2661 break;
2662 case CONVERT_SSCALED:
2663 SWR_INVALID("Type should not be zero extended!");
2664 conversionFactor = nullptr;
2665 break;
2666 default:
2667 SWR_ASSERT(conversionType == CONVERT_NONE);
2668 conversionFactor = nullptr;
2669 break;
2670 }
2671
2672 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2673 for (uint32_t i = 0; i < 4; i++)
2674 {
2675 if (isComponentEnabled(compMask, i))
2676 {
2677 if (compCtrl[i] == ComponentControl::StoreSrc)
2678 {
2679 // select correct constMask for x/z or y/w pshufb
2680 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2681 // if x or y, use the first (xy) gather result, else the second (zw) gather result
2682 uint32_t selectedGather = (i < 2) ? 0 : 1;
2683
2684 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2685 // after pshufb mask for x channel; z uses the same shuffle from the second gather
2686 // 256i - 0 1 2 3 4 5 6 7
2687 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2688
2689 // denormalize if needed
2690 if (conversionType != CONVERT_NONE)
2691 {
2692 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2693 }
2694 currentVertexElement++;
2695 }
2696 else
2697 {
2698 #if USE_SIMD16_SHADERS
2699 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2700 #else
2701 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2702 #endif
2703 }
2704
2705 if (currentVertexElement > 3)
2706 {
2707 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2708 // reset to the next vVertexElement to output
2709 currentVertexElement = 0;
2710 }
2711 }
2712 }
2713 }
2714 else
2715 {
2716 SWR_INVALID("Unsupported conversion type");
2717 }
2718 }
2719
2720 #endif
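// Editor's note: the 16bpc analogue of the 8bpc sketch further up, matching
// the 1/32767 and 1/65535 factors used by the shuffle variants above; the
// FPExt (half-float) path just widens each 16-bit float via CVTPH2PS and
// needs no scale factor. (Illustration only, not built; assumes <cstdint>.)
#if 0
static float Normalize16bpc(uint16_t raw, bool isSigned)
{
    return isSigned ? float(int16_t(raw)) * (1.0f / 32767.0f)   // SNORM
                    : float(raw)          * (1.0f / 65535.0f);  // UNORM
}
#endif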
2721 //////////////////////////////////////////////////////////////////////////
2722 /// @brief Output a simdvertex worth of elements to the current outputElt
2723 /// @param pVtxOut - base address of VIN output struct
2724 /// @param outputElt - simdvertex offset in VIN to write to
2725 /// @param numEltsToStore - number of simdvertex rows to write out
2726 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
2727 #if USE_SIMD16_GATHERS
2728 void FetchJit::StoreVertexElements16(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2729 {
2730 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2731
2732 for (uint32_t c = 0; c < numEltsToStore; ++c)
2733 {
2734 // STORE expects FP32 x vWidth type, just bitcast if needed
2735 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2736 {
2737 #if FETCH_DUMP_VERTEX
2738 PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
2739 #endif
2740 vVertexElements[c] = BITCAST(vVertexElements[c], mSimd16FP32Ty);
2741 }
2742 #if FETCH_DUMP_VERTEX
2743 else
2744 {
2745 PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
2746 }
2747 #endif
2748 // outputElt * 4 offsets by the size of a simdvertex;
2749 // + c selects a 32bit x vWidth row within the current vertex
2750 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
2751 STORE(vVertexElements[c], dest);
2752 }
2753 }
2754
2755 #else
2756 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2757 {
2758 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2759
2760 for (uint32_t c = 0; c < numEltsToStore; ++c)
2761 {
2762 // STORE expects FP32 x vWidth type, just bitcast if needed
2763 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2764 {
2765 #if FETCH_DUMP_VERTEX
2766 PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
2767 #endif
2768 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
2769 }
2770 #if FETCH_DUMP_VERTEX
2771 else
2772 {
2773 PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
2774 }
2775 #endif
2776 // outputElt * 4 offsets by the size of a simdvertex (doubled stride for SIMD16 shaders below);
2777 // + c selects a 32bit x vWidth row within the current vertex
2778 #if USE_SIMD16_SHADERS
2779 Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP");
2780 #else
2781 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
2782 #endif
2783 STORE(vVertexElements[c], dest);
2784 }
2785 }
2786
2787 #endif
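// Editor's note: a scalar sketch of the destination-row addressing used by the
// two StoreVertexElements variants above (illustration only, not built).
#if 0
static uint32_t SimdVertexRow(uint32_t outputElt, uint32_t component, bool useSimd16Shaders)
{
    // A simdvertex element is 4 float rows of vWidth lanes each; with SIMD16
    // shaders every logical row spans two SIMD8 rows, hence the doubled stride.
    return useSimd16Shaders ? outputElt * 8 + component * 2
                            : outputElt * 4 + component;
}
#endif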
2788 //////////////////////////////////////////////////////////////////////////
2789 /// @brief Generates a constant vector of values based on the
2790 /// ComponentControl value
2791 /// @param ctrl - ComponentControl value
2792 #if USE_SIMD16_GATHERS
2793 Value *FetchJit::GenerateCompCtrlVector16(const ComponentControl ctrl)
2794 {
2795 switch (ctrl)
2796 {
2797 case NoStore:
2798 return VUNDEF_I_16();
2799 case Store0:
2800 return VIMMED1_16(0);
2801 case Store1Fp:
2802 return VIMMED1_16(1.0f);
2803 case Store1Int:
2804 return VIMMED1_16(1);
2805 case StoreVertexId:
2806 {
2807 Value *pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
2808 Value *pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
2809
2810 Value *pId = JOIN_16(pId_lo, pId_hi);
2811
2812 return pId;
2813 }
2814 case StoreInstanceId:
2815 {
2816 Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
2817 return VBROADCAST_16(pId);
2818 }
2819
2820
2821 case StoreSrc:
2822 default:
2823 SWR_INVALID("Invalid component control");
2824 return VUNDEF_I_16();
2825 }
2826 }
2827
2828 #else
2829 #if USE_SIMD16_SHADERS
2830 Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2)
2831 #else
2832 Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
2833 #endif
2834 {
2835 switch (ctrl)
2836 {
2837 case NoStore:
2838 return VUNDEF_I();
2839 case Store0:
2840 return VIMMED1(0);
2841 case Store1Fp:
2842 return VIMMED1(1.0f);
2843 case Store1Int:
2844 return VIMMED1(1);
2845 case StoreVertexId:
2846 {
2847 #if USE_SIMD16_SHADERS
2848 Value *pId;
2849 if (useVertexID2)
2850 {
2851 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
2852 }
2853 else
2854 {
2855 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
2856 }
2857 #else
2858 Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
2859 #endif
2860 return pId;
2861 }
2862 case StoreInstanceId:
2863 {
2864 Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
2865 return VBROADCAST(pId);
2866 }
2867
2868
2869 case StoreSrc:
2870 default:
2871 SWR_INVALID("Invalid component control");
2872 return VUNDEF_I();
2873 }
2874 }
2875
2876 #endif
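// Editor's note: a scalar model of the ComponentControl defaults generated by
// the two variants above (illustration only, not built; assumes <cstdint> and
// <cstring>; NoStore lanes are simply left undefined).
#if 0
static float ComponentControlValue(ComponentControl ctrl, float srcValue,
                                   float vertexId, float instanceId)
{
    switch (ctrl)
    {
    case StoreSrc:        return srcValue;       // gathered component
    case Store0:          return 0.0f;
    case Store1Fp:        return 1.0f;
    case Store1Int:
    {
        // integer 1 stored as raw bits (VIMMED1(1)); StoreVertexElements
        // later bitcasts the row to float without changing the bits
        const int32_t one = 1;
        float f;
        std::memcpy(&f, &one, sizeof(f));
        return f;
    }
    case StoreVertexId:   return vertexId;       // VertexID bits, bitcast to float above
    case StoreInstanceId: return instanceId;     // broadcast CurInstance
    default:              return 0.0f;           // NoStore / invalid
    }
}
#endif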
2877 //////////////////////////////////////////////////////////////////////////
2878 /// @brief Returns the enable mask for the specified component.
2879 /// @param enableMask - enable bits
2880 /// @param component - component to check if enabled.
2881 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
2882 {
2883 switch (component)
2884 {
2885 // X
2886 case 0: return (enableMask & ComponentEnable::X);
2887 // Y
2888 case 1: return (enableMask & ComponentEnable::Y);
2889 // Z
2890 case 2: return (enableMask & ComponentEnable::Z);
2891 // W
2892 case 3: return (enableMask & ComponentEnable::W);
2893
2894 default: return false;
2895 }
2896 }
2897
2898 // We don't want two threads compiling the same fetch shader simultaneously;
2899 // it exposes problems in the JIT cache implementation.
2900 // Right now this is only a problem for fetch.
2901 static std::mutex gFetchCodegenMutex;
2902
2903 //////////////////////////////////////////////////////////////////////////
2904 /// @brief JITs from fetch shader IR
2905 /// @param hJitMgr - JitManager handle
2906 /// @param func - LLVM function IR
2907 /// @return PFN_FETCH_FUNC - pointer to fetch code
2908 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
2909 {
2910 const llvm::Function* func = (const llvm::Function*)hFunc;
2911 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2912 PFN_FETCH_FUNC pfnFetch;
2913
2914 gFetchCodegenMutex.lock();
2915 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
2916 // MCJIT finalizes modules the first time you JIT code from them. Once finalized, you cannot add new IR to the module
2917 pJitMgr->mIsModuleFinalized = true;
2918
2919 #if defined(KNOB_SWRC_TRACING)
2920 char fName[1024];
2921 const char *funcName = func->getName().data();
2922 sprintf(fName, "%s.bin", funcName);
2923 FILE *fd = fopen(fName, "wb");
2924 fwrite((void *)pfnFetch, 1, 2048, fd);
2925 fclose(fd);
2926 #endif
2927
2928 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
2929 gFetchCodegenMutex.unlock();
2930
2931
2932
2933 return pfnFetch;
2934 }
2935
2936 //////////////////////////////////////////////////////////////////////////
2937 /// @brief JIT compiles fetch shader
2938 /// @param hJitMgr - JitManager handle
2939 /// @param state - fetch state to build function from
2940 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
2941 {
2942 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2943
2944 pJitMgr->SetupNewModule();
2945
2946 FetchJit theJit(pJitMgr);
2947 HANDLE hFunc = theJit.Create(state);
2948
2949 return JitFetchFunc(hJitMgr, hFunc);
2950 }