1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder.h"
32 #include "jit_api.h"
33 #include "fetch_jit.h"
34 #include "gen_state_llvm.h"
35
36 //#define FETCH_DUMP_VERTEX 1
37 using namespace llvm;
38 using namespace SwrJit;
39
40 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
41
42 enum ConversionType
43 {
44 CONVERT_NONE,
45 CONVERT_NORMALIZED,
46 CONVERT_USCALED,
47 CONVERT_SSCALED,
48 CONVERT_SFIXED,
49 };
50
51 #if USE_SIMD16_SHADERS
52 #define USE_SIMD16_GATHERS 0
53 #endif
54
55 //////////////////////////////////////////////////////////////////////////
56 /// Interface to Jitting a fetch shader
57 //////////////////////////////////////////////////////////////////////////
58 struct FetchJit :
59 public Builder
60 {
61 FetchJit(JitManager* pJitMgr) :
62 Builder(pJitMgr)
63 {}
64
65 Function* Create(const FETCH_COMPILE_STATE& fetchState);
66
67 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
68 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
69 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
70
71 // package up Shuffle*bpcGatherd args into a tuple for convenience
72 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
73 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
74 const uint32_t(&)[4]> Shuffle8bpcArgs;
75
76 #if USE_SIMD16_SHADERS
77 #if USE_SIMD16_GATHERS
78 void Shuffle8bpcGatherd16(Shuffle8bpcArgs &args);
79 #else
80 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2);
81 #endif
82 #else
83 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
84 #endif
85
86 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
87 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
88
89 #if USE_SIMD16_SHADERS
90 #if USE_SIMD16_GATHERS
91 void Shuffle16bpcGather16(Shuffle16bpcArgs &args);
92 #else
93 void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2);
94 #endif
95 #else
96 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
97 #endif
98
99 #if USE_SIMD16_GATHERS
100 void StoreVertexElements16(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
101 #else
102 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
103 #endif
104
105 #if USE_SIMD16_SHADERS
106 #if USE_SIMD16_GATHERS
107 Value *GenerateCompCtrlVector16(const ComponentControl ctrl);
108 #else
109 Value *GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2);
110 #endif
111 #else
112 Value *GenerateCompCtrlVector(const ComponentControl ctrl);
113 #endif
114
115 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
116
117 #if USE_SIMD16_SHADERS
118 #if USE_SIMD16_GATHERS
119 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2);
120 #else
121 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2);
122 #endif
123 #else
124 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
125 #endif
126
127 bool IsOddFormat(SWR_FORMAT format);
128 bool IsUniformFormat(SWR_FORMAT format);
129 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
130 void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
131 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
132
133 Value* mpFetchInfo;
134 };
135
136 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
137 {
138 std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
139 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
140
141 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
142 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
143
144 fetch->getParent()->setModuleIdentifier(fetch->getName());
145
146 IRB()->SetInsertPoint(entry);
147
148 auto argitr = fetch->arg_begin();
149
150 // Fetch shader arguments
151 mpPrivateContext = &*argitr; ++argitr;
152 mpPrivateContext->setName("privateContext");
153
154 mpFetchInfo = &*argitr; ++argitr;
155 mpFetchInfo->setName("fetchInfo");
156 Value* pVtxOut = &*argitr;
157 pVtxOut->setName("vtxOutput");
158 // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex
159 // index 0 (just the pointer to the simdvertex structure)
160 // index 1 (which element of the simdvertex structure to offset to; in this case 0)
161 // so it doesn't matter that the indices are i32's
162 // TODO: generate this GEP with a VECTOR structure type so this makes sense
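// A minimal sketch of what the GEP below amounts to in LLVM IR, assuming vtxOutput is typed
// as a pointer to the simdvertex struct (the exact type name here is illustrative only):
//
//   %base = getelementptr %simdvertex, %simdvertex* %vtxOutput, i32 0
//
// i.e. "element 0 of the pointed-to object" -- just the base address -- which is why the
// integer width chosen for the index constants is irrelevant.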
163 std::vector<Value*> vtxInputIndices(2, C(0));
164 // GEP
165 pVtxOut = GEP(pVtxOut, C(0));
166 #if USE_SIMD16_SHADERS
167 #if 0// USE_SIMD16_BUILDER
168 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
169 #else
170 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
171 #endif
172 #else
173 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
174 #endif
175
176 // SWR_FETCH_CONTEXT::pStreams
177 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
178 streams->setName("pStreams");
179
180 // SWR_FETCH_CONTEXT::pIndices
181 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
182 indices->setName("pIndices");
183
184 // SWR_FETCH_CONTEXT::pLastIndex
185 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
186 pLastIndex->setName("pLastIndex");
187
188
189 Value* vIndices;
190 #if USE_SIMD16_SHADERS
191 Value* indices2;
192 Value* vIndices2;
193 #endif
194 switch(fetchState.indexType)
195 {
196 case R8_UINT:
197 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
198 #if USE_SIMD16_SHADERS
199 indices2 = GEP(indices, C(8));
200 #endif
201 if(fetchState.bDisableIndexOOBCheck)
202 {
203 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
204 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
205 #if USE_SIMD16_SHADERS
206 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
207 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
208 #endif
209 }
210 else
211 {
212 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
213 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
214 #if USE_SIMD16_SHADERS
215 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
216 vIndices2 = GetSimdValid8bitIndices(indices2, pLastIndex);
217 #endif
218 }
219 break;
220 case R16_UINT:
221 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
222 #if USE_SIMD16_SHADERS
223 indices2 = GEP(indices, C(8));
224 #endif
225 if(fetchState.bDisableIndexOOBCheck)
226 {
227 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
228 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
229 #if USE_SIMD16_SHADERS
230 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
231 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
232 #endif
233 }
234 else
235 {
236 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
237 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
238 #if USE_SIMD16_SHADERS
239 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
240 vIndices2 = GetSimdValid16bitIndices(indices2, pLastIndex);
241 #endif
242 }
243 break;
244 case R32_UINT:
245 #if USE_SIMD16_SHADERS
246 indices2 = GEP(indices, C(8));
247 #endif
248 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
249 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
250 #if USE_SIMD16_SHADERS
251 (fetchState.bDisableIndexOOBCheck) ? vIndices2 = LOAD(BITCAST(indices2, PointerType::get(mSimdInt32Ty, 0)), { (uint32_t)0 })
252 : vIndices2 = GetSimdValid32bitIndices(indices2, pLastIndex);
253 #endif
254 break; // incoming type is already 32bit int
255 default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break;
256 }
257
258 if(fetchState.bForceSequentialAccessEnable)
259 {
260 Value* pOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
261
262 // VertexData buffers are accessed sequentially, the index is equal to the vertex number
263 vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
264 vIndices = ADD(vIndices, pOffsets);
265 #if USE_SIMD16_SHADERS
266 vIndices2 = ADD(vIndices, VIMMED1(8));
267 #endif
268 }
269
270 Value* vVertexId = vIndices;
271 #if USE_SIMD16_SHADERS
272 Value* vVertexId2 = vIndices2;
273 #endif
274 if (fetchState.bVertexIDOffsetEnable)
275 {
276 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
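// Worked example (illustrative values): for an indexed draw with BaseVertex = 3 and
// StartVertex = 0, lane i produces VertexID[i] = index[i] + 3 + 0; the assumption above
// is that whichever of the two offsets is not in use is 0, so the sum reduces to the
// single offset that actually applies.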
277 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
278 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
279 vVertexId = ADD(vIndices, vBaseVertex);
280 vVertexId = ADD(vVertexId, vStartVertex);
281 #if USE_SIMD16_SHADERS
282 vVertexId2 = ADD(vIndices2, vBaseVertex);
283 vVertexId2 = ADD(vVertexId2, vStartVertex);
284 #endif
285 }
286
287 // store out vertex IDs
288 STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
289 #if USE_SIMD16_SHADERS
290 STORE(vVertexId2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
291 #endif
292
293 // store out cut mask if enabled
294 if (fetchState.bEnableCutIndex)
295 {
296 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
297 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
298 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
299 #if USE_SIMD16_SHADERS
300 Value* cutMask2 = VMASK(ICMP_EQ(vIndices2, vCutIndex));
301 STORE(cutMask2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
302 #endif
303 }
304
305 // Fetch attributes from memory and output to a simdvertex struct
306 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
307 #if USE_SIMD16_SHADERS
308 if (fetchState.bDisableVGATHER)
309 {
310 JitLoadVertices(fetchState, streams, vIndices, pVtxOut);
311 JitLoadVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)));
312 }
313 else
314 {
315 #if USE_SIMD16_GATHERS
316 JitGatherVertices(fetchState, streams, vIndices, vIndices2, pVtxOut, false);
317 #else
318 JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false);
319 JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true);
320 #endif
321 }
322 #else
323 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
324 : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
325 #endif
326
327 RET_VOID();
328
329 JitManager::DumpToFile(fetch, "src");
330
331 #if defined(_DEBUG)
332 verifyFunction(*fetch);
333 #endif
334
335 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
336
337 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
338 setupPasses.add(createBreakCriticalEdgesPass());
339 setupPasses.add(createCFGSimplificationPass());
340 setupPasses.add(createEarlyCSEPass());
341 setupPasses.add(createPromoteMemoryToRegisterPass());
342
343 setupPasses.run(*fetch);
344
345 JitManager::DumpToFile(fetch, "se");
346
347 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
348
349 ///@todo Haven't touched these either. Need to remove some of these and add others.
350 optPasses.add(createCFGSimplificationPass());
351 optPasses.add(createEarlyCSEPass());
352 optPasses.add(createInstructionCombiningPass());
353 optPasses.add(createInstructionSimplifierPass());
354 optPasses.add(createConstantPropagationPass());
355 optPasses.add(createSCCPPass());
356 optPasses.add(createAggressiveDCEPass());
357
358 optPasses.run(*fetch);
359 optPasses.run(*fetch);
360
361 JitManager::DumpToFile(fetch, "opt");
362
363
364 return fetch;
365 }
366
367 //////////////////////////////////////////////////////////////////////////
368 /// @brief Loads attributes from memory using LOADs, shuffling the
369 /// components into SOA form.
370 /// *Note* currently does not support component control,
371 /// component packing, instancing
372 /// @param fetchState - info about attributes to be fetched from memory
373 /// @param streams - value pointer to the current vertex stream
374 /// @param vIndices - vector value of indices to load
375 /// @param pVtxOut - value pointer to output simdvertex struct
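/// A sketch of the AOS -> SOA reshaping performed below (assuming an 8-wide simd):
///   input,  per vertex:  v0 = {x0 y0 z0 w0}, v1 = {x1 y1 z1 w1}, ..., v7 = {x7 y7 z7 w7}
///   output, per attrib:  X = {x0..x7}, Y = {y0..y7}, Z = {z0..z7}, W = {w0..w7}
/// i.e. each component becomes contiguous across lanes in its simdvertex slot.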
376 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
377 {
378 // Zack shuffles; a variant of the Charleston.
379
380 std::vector<Value*> vectors(16);
381 std::vector<Constant*> pMask(mVWidth);
382 for(uint32_t i = 0; i < mVWidth; ++i)
383 {
384 pMask[i] = (C(i < 4 ? i : 4));
385 }
386 Constant* promoteMask = ConstantVector::get(pMask);
387 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
388
389 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
390 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
391 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
392 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
393 curInstance->setName("curInstance");
394
395 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
396 {
397 Value* elements[4] = {0};
398 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
399 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
400 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
401 uint32_t numComponents = info.numComps;
402 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
403
404 // load path doesn't support component packing
405 SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
406
407 vectors.clear();
408
409 if (fetchState.bInstanceIDOffsetEnable)
410 {
411 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down");
412 }
413
414 Value *vCurIndices;
415 Value *startOffset;
416 if(ied.InstanceEnable)
417 {
418 Value* stepRate = C(ied.InstanceAdvancementState);
419
420 // prevent a div by 0 for 0 step rate
421 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
422 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
423
424 // calc the current offset into instanced data buffer
425 Value* calcInstance = UDIV(curInstance, stepRate);
426
427 // if step rate is 0, every instance gets instance 0
428 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
429
430 vCurIndices = VBROADCAST(calcInstance);
431
432 startOffset = startInstance;
433 }
434 else if (ied.InstanceStrideEnable)
435 {
436 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
437 }
438 else
439 {
440 // offset indices by baseVertex
441 vCurIndices = ADD(vIndices, vBaseVertex);
442
443 startOffset = startVertex;
444 }
445
446 // load SWR_VERTEX_BUFFER_STATE::pData
447 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
448
449 // load SWR_VERTEX_BUFFER_STATE::pitch
450 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
451 stride = Z_EXT(stride, mInt64Ty);
452
453 // load SWR_VERTEX_BUFFER_STATE::size
454 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
455 size = Z_EXT(size, mInt64Ty);
456
457 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
458
459 Value *minVertex = NULL;
460 Value *minVertexOffset = NULL;
461 if (fetchState.bPartialVertexBuffer) {
462 // fetch min index for low bounds checking
463 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
464 minVertex = LOAD(minVertex);
465 if (!fetchState.bDisableIndexOOBCheck) {
466 minVertexOffset = MUL(Z_EXT(minVertex, mInt64Ty), stride);
467 }
468 }
469
470 // Load from the stream.
471 for(uint32_t lane = 0; lane < mVWidth; ++lane)
472 {
473 // Get index
474 Value* index = VEXTRACT(vCurIndices, C(lane));
475
476 if (fetchState.bPartialVertexBuffer) {
477 // clamp below minvertex
478 Value *isBelowMin = ICMP_SLT(index, minVertex);
479 index = SELECT(isBelowMin, minVertex, index);
480 }
481
482 index = Z_EXT(index, mInt64Ty);
483
484 Value* offset = MUL(index, stride);
485 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
486 offset = ADD(offset, startVertexOffset);
487
488 if (!fetchState.bDisableIndexOOBCheck) {
489 // check for out-of-bounds access, including partial OOB, and replace such offsets with minVertex
490 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
491 Value *oob = ICMP_ULE(endOffset, size);
492 if (fetchState.bPartialVertexBuffer) {
493 offset = SELECT(oob, offset, minVertexOffset);
494 } else {
495 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
496 }
497 }
498
499 Value* pointer = GEP(stream, offset);
500 // We load a full 4-component lane here even though we may not need every component.
501 Value* vptr = 0;
502
503 // get a pointer to a 4 component attrib in default address space
504 switch(bpc)
505 {
506 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
507 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
508 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
509 default: SWR_INVALID("Unsupported underlying bpp!");
510 }
511
512 // load 4 components of attribute
513 Value* vec = ALIGNED_LOAD(vptr, 1, false);
514
515 // Convert To FP32 internally
516 switch(info.type[0])
517 {
518 case SWR_TYPE_UNORM:
519 switch(bpc)
520 {
521 case 8:
522 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
523 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
524 break;
525 case 16:
526 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
527 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
528 break;
529 default:
530 SWR_INVALID("Unsupported underlying type!");
531 break;
532 }
533 break;
534 case SWR_TYPE_SNORM:
535 switch(bpc)
536 {
537 case 8:
538 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
539 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
540 break;
541 case 16:
542 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
543 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
544 break;
545 default:
546 SWR_INVALID("Unsupported underlying type!");
547 break;
548 }
549 break;
550 case SWR_TYPE_UINT:
551 // Zero extend uint32_t types.
552 switch(bpc)
553 {
554 case 8:
555 case 16:
556 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
557 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
558 break;
559 case 32:
560 break; // Pass through unchanged.
561 default:
562 SWR_INVALID("Unsupported underlying type!");
563 break;
564 }
565 break;
566 case SWR_TYPE_SINT:
567 // Sign extend SINT types.
568 switch(bpc)
569 {
570 case 8:
571 case 16:
572 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
573 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
574 break;
575 case 32:
576 break; // Pass through unchanged.
577 default:
578 SWR_INVALID("Unsupported underlying type!");
579 break;
580 }
581 break;
582 case SWR_TYPE_FLOAT:
583 switch(bpc)
584 {
585 case 32:
586 break; // Pass through unchanged.
587 default:
588 SWR_INVALID("Unsupported underlying type!");
589 }
590 break;
591 case SWR_TYPE_USCALED:
592 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
593 break;
594 case SWR_TYPE_SSCALED:
595 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
596 break;
597 case SWR_TYPE_SFIXED:
598 vec = FMUL(SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)), VBROADCAST(C(1/65536.0f)));
599 break;
600 case SWR_TYPE_UNKNOWN:
601 case SWR_TYPE_UNUSED:
602 SWR_INVALID("Unsupported type %d!", info.type[0]);
603 }
604
605 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
606 // uwvec: 4 x F32, undef value
607 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
608 vectors.push_back(wvec);
609 }
610
611 std::vector<Constant*> v01Mask(mVWidth);
612 std::vector<Constant*> v23Mask(mVWidth);
613 std::vector<Constant*> v02Mask(mVWidth);
614 std::vector<Constant*> v13Mask(mVWidth);
615
616 // Concatenate the vectors together.
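// For reference (a sketch, not extra code): with mVWidth == 8 the shuffle masks built
// below evaluate to
//   v01Mask = {0,1,8,9, 4,5,12,13}    v23Mask = {2,3,10,11, 6,7,14,15}
//   v02Mask = {0,2,8,10, 4,6,12,14}   v13Mask = {1,3,9,11, 5,7,13,15}
// which together perform the 4x4 transpose that turns the per-vertex xyzw vectors into
// per-component (SOA) vectors.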
617 elements[0] = VUNDEF_F();
618 elements[1] = VUNDEF_F();
619 elements[2] = VUNDEF_F();
620 elements[3] = VUNDEF_F();
621 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
622 {
623 v01Mask[4 * b + 0] = C(0 + 4 * b);
624 v01Mask[4 * b + 1] = C(1 + 4 * b);
625 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
626 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
627
628 v23Mask[4 * b + 0] = C(2 + 4 * b);
629 v23Mask[4 * b + 1] = C(3 + 4 * b);
630 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
631 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
632
633 v02Mask[4 * b + 0] = C(0 + 4 * b);
634 v02Mask[4 * b + 1] = C(2 + 4 * b);
635 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
636 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
637
638 v13Mask[4 * b + 0] = C(1 + 4 * b);
639 v13Mask[4 * b + 1] = C(3 + 4 * b);
640 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
641 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
642
643 std::vector<Constant*> iMask(mVWidth);
644 for(uint32_t i = 0; i < mVWidth; ++i)
645 {
646 if(((4 * b) <= i) && (i < (4 * (b + 1))))
647 {
648 iMask[i] = C(i % 4 + mVWidth);
649 }
650 else
651 {
652 iMask[i] = C(i);
653 }
654 }
655 Constant* insertMask = ConstantVector::get(iMask);
656 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
657 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
658 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
659 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
660 }
661
662 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
663 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
664 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
665 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
666 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
667 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
668 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
669 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
670
671 switch(numComponents + 1)
672 {
673 case 1: elements[0] = VIMMED1(0.0f);
674 case 2: elements[1] = VIMMED1(0.0f);
675 case 3: elements[2] = VIMMED1(0.0f);
676 case 4: elements[3] = VIMMED1(1.0f);
677 }
678
679 for(uint32_t c = 0; c < 4; ++c)
680 {
681 #if USE_SIMD16_SHADERS
682 Value* dest = GEP(pVtxOut, C(nelt * 8 + c * 2), "destGEP");
683 #else
684 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
685 #endif
686 STORE(elements[c], dest);
687 }
688 }
689 }
690
691 // returns true for odd formats that require special gather handling
692 bool FetchJit::IsOddFormat(SWR_FORMAT format)
693 {
694 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
695 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
696 {
697 return true;
698 }
699 return false;
700 }
701
702 // format is uniform if all components are the same size and type
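// e.g. a 16:16:16:16 UNORM format is uniform (every component is 16-bit UNORM), while
// 5:6:5 or 10:10:10:2 layouts are not, since the component sizes differ.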
703 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
704 {
705 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
706 uint32_t bpc0 = info.bpc[0];
707 uint32_t type0 = info.type[0];
708
709 for (uint32_t c = 1; c < info.numComps; ++c)
710 {
711 if (bpc0 != info.bpc[c] || type0 != info.type[c])
712 {
713 return false;
714 }
715 }
716 return true;
717 }
718
719 // unpacks components based on format
720 // foreach component in the pixel
721 // mask off everything but this component
722 // shift component to LSB
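// Worked example (illustrative 5:6:5 layout): the loop below computes
//   c = 0: compBits = 5, bitmask = 0x001F, shift  0
//   c = 1: compBits = 6, bitmask = 0x07E0, shift  5
//   c = 2: compBits = 5, bitmask = 0xF800, shift 11
// and each isolated component is written to result[info.swizzle[c]].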
723 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
724 {
725 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
726
727 uint32_t bitOffset = 0;
728 for (uint32_t c = 0; c < info.numComps; ++c)
729 {
730 uint32_t swizzledIndex = info.swizzle[c];
731 uint32_t compBits = info.bpc[c];
732 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
733 Value* comp = AND(vInput, bitmask);
734 comp = LSHR(comp, bitOffset);
735
736 result[swizzledIndex] = comp;
737 bitOffset += compBits;
738 }
739 }
740
741 // gather for odd component size formats
742 // gather a full SIMD of whole pixels (one pixel per lane), then shift/mask to move each
743 // component into its own vector
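// e.g. (illustrative) R10G10B10A2_UNORM is "odd": 32 bpp overall but 10/10/10/2-bit
// components, so whole pixels are fetched per lane and split apart via UnpackComponents.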
744 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
745 {
746 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
747
748 // only works if pixel size is <= 32bits
749 SWR_ASSERT(info.bpp <= 32);
750
751 Value *pGather;
752 if (info.bpp == 32)
753 {
754 pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
755 }
756 else
757 {
758 // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
759 Value *pMem = ALLOCA(mSimdInt32Ty);
760 STORE(VIMMED1(0u), pMem);
761
762 pBase = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
763 Value* pDstMem = BITCAST(pMem, mInt32PtrTy);
764
765 for (uint32_t lane = 0; lane < mVWidth; ++lane)
766 {
767 // Get index
768 Value* index = VEXTRACT(pOffsets, C(lane));
769 Value* mask = VEXTRACT(pMask, C(lane));
770 switch (info.bpp)
771 {
772 case 8:
773 {
774 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
775 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt8Ty, 0));
776 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
777 break;
778 }
779
780 case 16:
781 {
782 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
783 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
784 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
785 break;
786 }
787 break;
788
789 case 24:
790 {
791 // First 16-bits of data
792 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
793 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
794 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
795
796 // Last 8-bits of data
797 pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
798 pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0));
799 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
800 break;
801 }
802
803 default:
804 SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
805 break;
806 }
807 }
808
809 pGather = LOAD(pMem);
810 }
811
812 for (uint32_t comp = 0; comp < 4; ++comp)
813 {
814 pResult[comp] = VIMMED1((int)info.defaults[comp]);
815 }
816
817 UnpackComponents(format, pGather, pResult);
818
819 // cast to fp32
820 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
821 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
822 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
823 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
824 }
825
826 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
827 {
828 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
829
830 for (uint32_t c = 0; c < info.numComps; ++c)
831 {
832 uint32_t compIndex = info.swizzle[c];
833
834 // skip any conversion on UNUSED components
835 if (info.type[c] == SWR_TYPE_UNUSED)
836 {
837 continue;
838 }
839
840 if (info.isNormalized[c])
841 {
842 if (info.type[c] == SWR_TYPE_SNORM)
843 {
844 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
845
846 /// result = c * (1.0f / (2^(n-1) - 1))
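/// e.g. for an 8-bit SNORM component: n = 8, scale = 1/127, so a raw value of 127
/// converts to 1.0f and -127 converts to -1.0f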
847 uint32_t n = info.bpc[c];
848 uint32_t pow2 = 1 << (n - 1);
849 float scale = 1.0f / (float)(pow2 - 1);
850 Value *vScale = VIMMED1(scale);
851 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
852 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
853 texels[compIndex] = FMUL(texels[compIndex], vScale);
854 }
855 else
856 {
857 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
858
859 /// result = c * (1.0f / (2^n - 1))
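/// e.g. for an 8-bit UNORM component: n = 8, scale = 1/255, so a raw value of 255
/// converts to 1.0f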
860 uint32_t n = info.bpc[c];
861 uint32_t pow2 = 1 << n;
862 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
863 if (n == 24)
864 {
865 float scale = (float)(pow2 - 1);
866 Value* vScale = VIMMED1(scale);
867 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
868 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
869 texels[compIndex] = FDIV(texels[compIndex], vScale);
870 }
871 else
872 {
873 float scale = 1.0f / (float)(pow2 - 1);
874 Value *vScale = VIMMED1(scale);
875 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
876 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
877 texels[compIndex] = FMUL(texels[compIndex], vScale);
878 }
879 }
880 continue;
881 }
882 }
883 }
884
885 //////////////////////////////////////////////////////////////////////////
886 /// @brief Loads attributes from memory using AVX2 GATHER(s)
887 /// @param fetchState - info about attributes to be fetched from memory
888 /// @param streams - value pointer to the current vertex stream
889 /// @param vIndices - vector value of indices to gather
890 /// @param pVtxOut - value pointer to output simdvertex struct
891 #if USE_SIMD16_SHADERS
892 #if USE_SIMD16_GATHERS
893 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
894 Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2)
895 #else
896 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
897 Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2)
898 #endif
899 #else
900 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
901 Value* streams, Value* vIndices, Value* pVtxOut)
902 #endif
903 {
904 uint32_t currentVertexElement = 0;
905 uint32_t outputElt = 0;
906 Value* vVertexElements[4];
907 #if USE_SIMD16_GATHERS
908 Value *pVtxSrc2[4];
909 #endif
910
911 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
912 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
913 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
914 #if USE_SIMD16_GATHERS
915 Value* vBaseVertex16 = VBROADCAST_16(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
916 #else
917 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
918 #endif
919 curInstance->setName("curInstance");
920
921 for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
922 {
923 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
924
925 // skip element if all components are disabled
926 if (ied.ComponentPacking == ComponentEnable::NONE)
927 {
928 continue;
929 }
930
931 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
932 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
933 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
934
935 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
936
937 // VGATHER* takes an *i8 src pointer
938 Value *pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
939
940 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
941 #if USE_SIMD16_GATHERS
942 Value *vStride16 = VBROADCAST_16(stride);
943 #else
944 Value *vStride = VBROADCAST(stride);
945 #endif
946
947 // max vertex index that is fully in bounds
948 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
949 maxVertex = LOAD(maxVertex);
950
951 Value *minVertex = NULL;
952 if (fetchState.bPartialVertexBuffer)
953 {
954 // min vertex index for low bounds OOB checking
955 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
956 minVertex = LOAD(minVertex);
957 }
958
959 if (fetchState.bInstanceIDOffsetEnable)
960 {
961 // the InstanceID (curInstance) value is offset by StartInstanceLocation
962 curInstance = ADD(curInstance, startInstance);
963 }
964
965 #if USE_SIMD16_GATHERS
966 Value *vCurIndices16;
967 #else
968 Value *vCurIndices;
969 #endif
970 Value *startOffset;
971 #if USE_SIMD16_GATHERS
972 Value *vInstanceStride16 = VIMMED1_16(0);
973 #else
974 Value *vInstanceStride = VIMMED1(0);
975 #endif
976
977 if (ied.InstanceEnable)
978 {
979 Value* stepRate = C(ied.InstanceAdvancementState);
980
981 // prevent a div by 0 for 0 step rate
982 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
983 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
984
985 // calc the current offset into instanced data buffer
986 Value* calcInstance = UDIV(curInstance, stepRate);
987
988 // if step rate is 0, every instance gets instance 0
989 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
990
991 #if USE_SIMD16_GATHERS
992 vCurIndices16 = VBROADCAST_16(calcInstance);
993 #else
994 vCurIndices = VBROADCAST(calcInstance);
995 #endif
996
997 startOffset = startInstance;
998 }
999 else if (ied.InstanceStrideEnable)
1000 {
1001 // grab the instance advancement state, which determines the stride in bytes from one instance to the next
1002 Value* stepRate = C(ied.InstanceAdvancementState);
1003 #if USE_SIMD16_GATHERS
1004 vInstanceStride16 = VBROADCAST_16(MUL(curInstance, stepRate));
1005 #else
1006 vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
1007 #endif
1008
1009 // offset indices by baseVertex
1010 #if USE_SIMD16_GATHERS
1011 Value *vIndices16 = JOIN_16(vIndices, vIndices2);
1012
1013 vCurIndices16 = ADD(vIndices16, vBaseVertex16);
1014 #else
1015 vCurIndices = ADD(vIndices, vBaseVertex);
1016 #endif
1017
1018 startOffset = startVertex;
1019 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
1020 }
1021 else
1022 {
1023 // offset indices by baseVertex
1024 #if USE_SIMD16_GATHERS
1025 Value *vIndices16 = JOIN_16(vIndices, vIndices2);
1026
1027 vCurIndices16 = ADD(vIndices16, vBaseVertex16);
1028 #else
1029 vCurIndices = ADD(vIndices, vBaseVertex);
1030 #endif
1031
1032 startOffset = startVertex;
1033 }
1034
1035 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
1036 // do 64bit address offset calculations.
1037
1038 // calculate byte offset to the start of the VB
1039 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
1040 pStreamBase = GEP(pStreamBase, baseOffset);
1041
1042 // if we have a start offset, subtract from max vertex. Used for OOB check
1043 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
1044 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
1045 // if we have a negative value, we're already OOB. clamp at 0.
1046 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
1047
1048 if (fetchState.bPartialVertexBuffer)
1049 {
1050 // similarly for min vertex
1051 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
1052 Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
1053 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
1054 }
1055
1056 // Load the in bounds size of a partially valid vertex
1057 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
1058 partialInboundsSize = LOAD(partialInboundsSize);
1059 #if USE_SIMD16_GATHERS
1060 Value *vPartialVertexSize = VBROADCAST_16(partialInboundsSize);
1061 Value *vBpp = VBROADCAST_16(C(info.Bpp));
1062 Value *vAlignmentOffsets = VBROADCAST_16(C(ied.AlignedByteOffset));
1063 #else
1064 Value *vPartialVertexSize = VBROADCAST(partialInboundsSize);
1065 Value *vBpp = VBROADCAST(C(info.Bpp));
1066 Value *vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
1067 #endif
1068
1069 // is the element <= the partially valid size
1070 Value *vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
1071
1072 #if USE_SIMD16_GATHERS
1073 // override cur indices with 0 if pitch is 0
1074 Value *pZeroPitchMask16 = ICMP_EQ(vStride16, VIMMED1_16(0));
1075 vCurIndices16 = SELECT(pZeroPitchMask16, VIMMED1_16(0), vCurIndices16);
1076
1077 // are vertices partially OOB?
1078 Value *vMaxVertex16 = VBROADCAST_16(maxVertex);
1079 Value *vPartialOOBMask = ICMP_EQ(vCurIndices16, vMaxVertex16);
1080
1081 // are vertices fully in bounds?
1082 Value *vMaxGatherMask16 = ICMP_ULT(vCurIndices16, vMaxVertex16);
1083
1084 Value *vGatherMask16;
1085
1086 if (fetchState.bPartialVertexBuffer)
1087 {
1088 // are vertices below minVertex limit?
1089 Value *vMinVertex16 = VBROADCAST_16(minVertex);
1090 Value *vMinGatherMask16 = ICMP_UGE(vCurIndices16, vMinVertex16);
1091
1092 // only fetch lanes that pass both tests
1093 vGatherMask16 = AND(vMaxGatherMask16, vMinGatherMask16);
1094 }
1095 else
1096 {
1097 vGatherMask16 = vMaxGatherMask16;
1098 }
1099
1100 // blend in any partially OOB indices that have valid elements
1101 vGatherMask16 = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask16);
1102
1103 // calculate the actual offsets into the VB
1104 Value *vOffsets16 = MUL(vCurIndices16, vStride16);
1105 vOffsets16 = ADD(vOffsets16, vAlignmentOffsets);
1106
1107 // if instance stride enable is:
1108 // true - add the product of the instanceID and advancement state to the offset into the VB
1109 // false - the value of vInstanceStride has been initialized to zero
1110 vOffsets16 = ADD(vOffsets16, vInstanceStride16);
1111
1112 // TODO: remove the following simd8 interop stuff once all code paths are fully widened to SIMD16..
1113
1114 Value *vGatherMask = EXTRACT_16(vGatherMask16, 0);
1115 Value *vGatherMask2 = EXTRACT_16(vGatherMask16, 1);
1116
1117 Value *vOffsets = EXTRACT_16(vOffsets16, 0);
1118 Value *vOffsets2 = EXTRACT_16(vOffsets16, 1);
1119 #else
1120 // override cur indices with 0 if pitch is 0
1121 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
1122 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
1123
1124 // are vertices partially OOB?
1125 Value* vMaxVertex = VBROADCAST(maxVertex);
1126 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
1127
1128 // are vertices fully in bounds?
1129 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
1130
1131 Value *vGatherMask;
1132 if (fetchState.bPartialVertexBuffer)
1133 {
1134 // are vertices below minVertex limit?
1135 Value *vMinVertex = VBROADCAST(minVertex);
1136 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
1137
1138 // only fetch lanes that pass both tests
1139 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
1140 }
1141 else
1142 {
1143 vGatherMask = vMaxGatherMask;
1144 }
1145
1146 // blend in any partially OOB indices that have valid elements
1147 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
1148
1149 // calculate the actual offsets into the VB
1150 Value* vOffsets = MUL(vCurIndices, vStride);
1151 vOffsets = ADD(vOffsets, vAlignmentOffsets);
1152
1153 // if instance stride enable is:
1154 // true - add the product of the instanceID and advancement state to the offset into the VB
1155 // false - the value of vInstanceStride has been initialized to zero
1156 vOffsets = ADD(vOffsets, vInstanceStride);
1157
1158 #endif
1159 // Packing and component control
1160 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
1161 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
1162 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
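// Illustrative example (names per the ComponentControl enum): an element packing only X
// and Y with compCtrl = { StoreSrc, StoreSrc, Store0, Store1Fp } lands in the simdvertex
// slot as { x, y, 0.0f, 1.0f } -- gathered where StoreSrc is set, constants generated by
// GenerateCompCtrlVector elsewhere.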
1163
1164 // Special gather/conversion for formats without equal component sizes
1165 if (IsOddFormat((SWR_FORMAT)ied.Format))
1166 {
1167 #if USE_SIMD16_GATHERS
1168 Value *pResults[4];
1169 Value *pResults2[4];
1170 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1171 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask2, pStreamBase, vOffsets2, pResults2);
1172 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1173 ConvertFormat((SWR_FORMAT)ied.Format, pResults2);
1174
1175 for (uint32_t c = 0; c < 4; c += 1)
1176 {
1177 if (isComponentEnabled(compMask, c))
1178 {
1179 // pack adjacent pairs of SIMD8s into SIMD16s
1180 pVtxSrc2[currentVertexElement++] = JOIN_16(pResults[c], pResults2[c]);
1181
1182 if (currentVertexElement > 3)
1183 {
1184 // store SIMD16s
1185 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1186
1187 StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
1188 // reset to the next vVertexElement to output
1189 currentVertexElement = 0;
1190 }
1191 }
1192 }
1193 #else
1194 Value *pResults[4];
1195 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1196 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1197
1198 for (uint32_t c = 0; c < 4; c += 1)
1199 {
1200 if (isComponentEnabled(compMask, c))
1201 {
1202 vVertexElements[currentVertexElement++] = pResults[c];
1203 if (currentVertexElement > 3)
1204 {
1205 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1206 // reset to the next vVertexElement to output
1207 currentVertexElement = 0;
1208 }
1209 }
1210 }
1211 #endif
1212 }
1213 else if(info.type[0] == SWR_TYPE_FLOAT)
1214 {
1215 ///@todo: support 64 bit vb accesses
1216 Value *gatherSrc = VIMMED1(0.0f);
1217 #if USE_SIMD16_GATHERS
1218 Value *gatherSrc16 = VIMMED1_16(0.0f);
1219 #endif
1220
1221 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1222 "Unsupported format for standard gather fetch.");
1223
1224 // Gather components from memory to store in a simdvertex structure
1225 switch (bpc)
1226 {
1227 case 16:
1228 {
1229 #if USE_SIMD16_GATHERS
1230 Value *gatherResult[2];
1231
1232 // if we have at least one component out of x or y to fetch
1233 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1234 {
1235 gatherResult[0] = GATHERPS_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1236
1237 // e.g. result of first 8x32bit integer gather for 16bit components
1238 // 256i - 0 1 2 3 4 5 6 7
1239 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1240 //
1241 }
1242 else
1243 {
1244 gatherResult[0] = VUNDEF_I_16();
1245 }
1246
1247 // if we have at least one component out of z or w to fetch
1248 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1249 {
1250 // offset base to the next components(zw) in the vertex to gather
1251 pStreamBase = GEP(pStreamBase, C((char)4));
1252
1253 gatherResult[1] = GATHERPS_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1254
1255 // e.g. result of second 8x32bit integer gather for 16bit components
1256 // 256i - 0 1 2 3 4 5 6 7
1257 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1258 //
1259 }
1260 else
1261 {
1262 gatherResult[1] = VUNDEF_I_16();
1263 }
1264
1265 // if we have at least one component to shuffle into place
1266 if (compMask)
1267 {
1268 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1269
1270 Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE,
1271 currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
1272
1273 // Shuffle gathered components into place in simdvertex struct
1274 Shuffle16bpcGather16(args); // outputs to vVertexElements ref
1275 }
1276 #else
1277 Value *vGatherResult[2];
1278
1279 // if we have at least one component out of x or y to fetch
1280 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1281 {
1282 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1283 // e.g. result of first 8x32bit integer gather for 16bit components
1284 // 256i - 0 1 2 3 4 5 6 7
1285 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1286 //
1287 }
1288
1289 // if we have at least one component out of z or w to fetch
1290 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1291 {
1292 // offset base to the next components(zw) in the vertex to gather
1293 pStreamBase = GEP(pStreamBase, C((char)4));
1294
1295 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1296 // e.g. result of second 8x32bit integer gather for 16bit components
1297 // 256i - 0 1 2 3 4 5 6 7
1298 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1299 //
1300 }
1301
1302 // if we have at least one component to shuffle into place
1303 if (compMask)
1304 {
1305 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
1306 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1307
1308 // Shuffle gathered components into place in simdvertex struct
1309 #if USE_SIMD16_SHADERS
1310 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1311 #else
1312 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1313 #endif
1314 }
1315 #endif
1316 }
1317 break;
1318 case 32:
1319 {
1320 for (uint32_t i = 0; i < 4; i += 1)
1321 {
1322 #if USE_SIMD16_GATHERS
1323 if (isComponentEnabled(compMask, i))
1324 {
1325 // if we need to gather the component
1326 if (compCtrl[i] == StoreSrc)
1327 {
1328 // Gather a SIMD of vertices
1329 // APIs allow a 4GB range for offsets
1330 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1331 // But, we know that elements must be aligned for FETCH. :)
1332 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
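// Worked example: a byte offset of 0x80000004 (> 2GB) would be negative as a signed
// 32-bit index; stored pre-shifted as 0x40000002 and gathered with scale = 2, it
// reconstitutes 0x40000002 * 2 = 0x80000004 with no sign-extension problem (valid
// because aligned element offsets are always even).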
1333 Value *shiftedOffsets16 = LSHR(vOffsets16, 1);
1334 pVtxSrc2[currentVertexElement++] = GATHERPS_16(gatherSrc16, pStreamBase, shiftedOffsets16, vGatherMask16, 2);
1335 }
1336 else
1337 {
1338 pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
1339 }
1340
1341 if (currentVertexElement > 3)
1342 {
1343 // store SIMD16s
1344 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1345
1346 StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
1347 // reset to the next vVertexElement to output
1348 currentVertexElement = 0;
1349 }
1350 }
1351
1352 // offset base to the next component in the vertex to gather
1353 pStreamBase = GEP(pStreamBase, C((char)4));
1354 #else
1355 if (isComponentEnabled(compMask, i))
1356 {
1357 // if we need to gather the component
1358 if (compCtrl[i] == StoreSrc)
1359 {
1360 // Gather a SIMD of vertices
1361 // APIs allow a 4GB range for offsets
1362 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1363 // But, we know that elements must be aligned for FETCH. :)
1364 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
1365 Value *vShiftedOffsets = LSHR(vOffsets, 1);
1366 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
1367 }
1368 else
1369 {
1370 #if USE_SIMD16_SHADERS
1371 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1372 #else
1373 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1374 #endif
1375 }
1376
1377 if (currentVertexElement > 3)
1378 {
1379 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1380 // reset to the next vVertexElement to output
1381 currentVertexElement = 0;
1382 }
1383 }
1384
1385 // offset base to the next component in the vertex to gather
1386 pStreamBase = GEP(pStreamBase, C((char)4));
1387 #endif
1388 }
1389 }
1390 break;
1391 case 64:
1392 {
1393 for (uint32_t i = 0; i < 4; i += 1)
1394 {
1395 #if USE_SIMD16_GATHERS
1396 if (isComponentEnabled(compMask, i))
1397 {
1398 // if we need to gather the component
1399 if (compCtrl[i] == StoreSrc)
1400 {
1401 Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1402 Value *vMaskLo2 = VSHUFFLE(vGatherMask2, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1403 Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1404 Value *vMaskHi2 = VSHUFFLE(vGatherMask2, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1405
1406 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1407 Value *vOffsetsLo2 = VEXTRACTI128(vOffsets2, C(0));
1408 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1409 Value *vOffsetsHi2 = VEXTRACTI128(vOffsets2, C(1));
1410
1411 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1412
1413 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
1414 Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2);
1415 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
1416 Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2);
1417
1418 pGatherLo = VCVTPD2PS(pGatherLo);
1419 pGatherLo2 = VCVTPD2PS(pGatherLo2);
1420 pGatherHi = VCVTPD2PS(pGatherHi);
1421 pGatherHi2 = VCVTPD2PS(pGatherHi2);
1422
1423 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1424 Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1425
1426 // pack adjacent pairs of SIMD8s into SIMD16s
1427 pVtxSrc2[currentVertexElement++] = JOIN_16(pGather, pGather2);
1428 }
1429 else
1430 {
1431 pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
1432 }
1433
1434 if (currentVertexElement > 3)
1435 {
1436 // store SIMD16s
1437 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1438
1439 StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
1440 // reset to the next vVertexElement to output
1441 currentVertexElement = 0;
1442 }
1443 }
1444
1445 // offset base to the next component in the vertex to gather
1446 pStreamBase = GEP(pStreamBase, C((char)8));
1447 #else
1448 if (isComponentEnabled(compMask, i))
1449 {
1450 // if we need to gather the component
1451 if (compCtrl[i] == StoreSrc)
1452 {
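// A sketch of the 64-bit path below: each 256-bit GATHERPD returns only 4 doubles, so an
// 8-wide float result takes two gathers (lanes 0-3 via the "Lo" mask/offsets, lanes 4-7
// via "Hi"); each half is converted to 4 floats with VCVTPD2PS and the halves are then
// recombined into one 8-wide float vector by VSHUFFLE.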
1453 Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
1454 Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
1455
1456 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1457 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1458
1459 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1460
1461 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
1462 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
1463
1464 pGatherLo = VCVTPD2PS(pGatherLo);
1465 pGatherHi = VCVTPD2PS(pGatherHi);
1466
1467 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
1468
1469 vVertexElements[currentVertexElement++] = pGather;
1470 }
1471 else
1472 {
1473 #if USE_SIMD16_SHADERS
1474 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1475 #else
1476 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1477 #endif
1478 }
1479
1480 if (currentVertexElement > 3)
1481 {
1482 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1483 // reset to the next vVertexElement to output
1484 currentVertexElement = 0;
1485 }
1486 }
1487
1488 // offset base to the next component in the vertex to gather
1489 pStreamBase = GEP(pStreamBase, C((char)8));
1490 #endif
1491 }
1492 }
1493 break;
1494 default:
1495 SWR_INVALID("Tried to fetch invalid FP format");
1496 break;
1497 }
1498 }
1499 else
1500 {
1501 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
1502 ConversionType conversionType = CONVERT_NONE;
1503
1504 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1505 "Unsupported format for standard gather fetch.");
1506
1507 switch(info.type[0])
1508 {
1509 case SWR_TYPE_UNORM:
1510 conversionType = CONVERT_NORMALIZED;
1511 case SWR_TYPE_UINT:
1512 extendCastType = Instruction::CastOps::ZExt;
1513 break;
1514 case SWR_TYPE_SNORM:
1515 conversionType = CONVERT_NORMALIZED;
1516 case SWR_TYPE_SINT:
1517 extendCastType = Instruction::CastOps::SExt;
1518 break;
1519 case SWR_TYPE_USCALED:
1520 conversionType = CONVERT_USCALED;
1521 extendCastType = Instruction::CastOps::UIToFP;
1522 break;
1523 case SWR_TYPE_SSCALED:
1524 conversionType = CONVERT_SSCALED;
1525 extendCastType = Instruction::CastOps::SIToFP;
1526 break;
1527 case SWR_TYPE_SFIXED:
1528 conversionType = CONVERT_SFIXED;
1529 extendCastType = Instruction::CastOps::SExt;
1530 break;
1531 default:
1532 break;
1533 }
1534
1535 // value substituted when component of gather is masked
1536 Value* gatherSrc = VIMMED1(0);
1537 #if USE_SIMD16_GATHERS
1538 Value *gatherSrc16 = VIMMED1_16(0);
1539 #endif
1540
1541 // Gather components from memory to store in a simdvertex structure
1542 switch (bpc)
1543 {
1544 case 8:
1545 {
1546 // if we have at least one component to fetch
1547 if (compMask)
1548 {
1549 #if USE_SIMD16_GATHERS
1550 Value *gatherResult = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1551
1552 // e.g. result of an 8x32bit integer gather for 8bit components
1553 // 256i - 0 1 2 3 4 5 6 7
1554 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1555
1556 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1557
1558 Shuffle8bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
1559 currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2, info.swizzle);
1560
1561 // Shuffle gathered components into place in simdvertex struct
1562 Shuffle8bpcGatherd16(args); // outputs to vVertexElements ref
1563 #else
1564 Value *vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1565 // e.g. result of an 8x32bit integer gather for 8bit components
1566 // 256i - 0 1 2 3 4 5 6 7
1567 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1568
1569 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1570 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1571
1572 // Shuffle gathered components into place in simdvertex struct
1573 #if USE_SIMD16_SHADERS
1574 Shuffle8bpcGatherd(args, useVertexID2); // outputs to vVertexElements ref
1575 #else
1576 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
1577 #endif
1578 #endif
1579 }
1580 }
1581 break;
1582 case 16:
1583 {
1584 #if USE_SIMD16_GATHERS
1585 Value *gatherResult[2];
1586
1587 // if we have at least one component out of x or y to fetch
1588 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1589 {
1590 gatherResult[0] = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1591
1592 // e.g. result of first 8x32bit integer gather for 16bit components
1593 // 256i - 0 1 2 3 4 5 6 7
1594 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1595 //
1596 }
1597 else
1598 {
1599 gatherResult[0] = VUNDEF_I_16();
1600 }
1601
1602 // if we have at least one component out of z or w to fetch
1603 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1604 {
1605 // offset base to the next components (zw) in the vertex to gather
1606 pStreamBase = GEP(pStreamBase, C((char)4));
1607
1608 gatherResult[1] = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1609
1610 // e.g. result of second 8x32bit integer gather for 16bit components
1611 // 256i - 0 1 2 3 4 5 6 7
1612 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1613 //
1614 }
1615 else
1616 {
1617 gatherResult[1] = VUNDEF_I_16();
1618 }
1619
1620 // if we have at least one component to shuffle into place
1621 if (compMask)
1622 {
1623 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1624
1625 Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
1626 currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
1627
1628 // Shuffle gathered components into place in simdvertex struct
1629 Shuffle16bpcGather16(args); // outputs to vVertexElements ref
1630 }
1631 #else
1632 Value *vGatherResult[2];
1633
1634 // if we have at least one component out of x or y to fetch
1635 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1636 {
1637 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1638 // e.g. result of first 8x32bit integer gather for 16bit components
1639 // 256i - 0 1 2 3 4 5 6 7
1640 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1641 //
1642 }
1643
1644 // if we have at least one component out of z or w to fetch
1645 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1646 {
1647 // offset base to the next components (zw) in the vertex to gather
1648 pStreamBase = GEP(pStreamBase, C((char)4));
1649
1650 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1651 // e.g. result of second 8x32bit integer gather for 16bit components
1652 // 256i - 0 1 2 3 4 5 6 7
1653 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1654 //
1655 }
1656
1657 // if we have at least one component to shuffle into place
1658 if (compMask)
1659 {
1660 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1661 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1662
1663 // Shuffle gathered components into place in simdvertex struct
1664 #if USE_SIMD16_SHADERS
1665 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1666 #else
1667 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1668 #endif
1669 }
1670 #endif
1671 }
1672 break;
1673 case 32:
1674 {
1675 // Gather components into place in the simdvertex struct
1676 for (uint32_t i = 0; i < 4; i++)
1677 {
1678 if (isComponentEnabled(compMask, i))
1679 {
1680 // if we need to gather the component
1681 if (compCtrl[i] == StoreSrc)
1682 {
1683 #if USE_SIMD16_GATHERS
1684 Value *pGather = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
1685
1686 if (conversionType == CONVERT_USCALED)
1687 {
1688 pGather = UI_TO_FP(pGather, mSimd16FP32Ty);
1689 }
1690 else if (conversionType == CONVERT_SSCALED)
1691 {
1692 pGather = SI_TO_FP(pGather, mSimd16FP32Ty);
1693 }
1694 else if (conversionType == CONVERT_SFIXED)
1695 {
1696 pGather = FMUL(SI_TO_FP(pGather, mSimd16FP32Ty), VBROADCAST_16(C(1 / 65536.0f)));
1697 }
1698
1699 pVtxSrc2[currentVertexElement++] = pGather;
1700
1701 // e.g. result of a single 8x32bit integer gather for 32bit components
1702 // 256i - 0 1 2 3 4 5 6 7
1703 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1704 #else
1705 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1706
1707 if (conversionType == CONVERT_USCALED)
1708 {
1709 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1710 }
1711 else if (conversionType == CONVERT_SSCALED)
1712 {
1713 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1714 }
1715 else if (conversionType == CONVERT_SFIXED)
1716 {
1717 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
1718 }
1719
1720 vVertexElements[currentVertexElement++] = pGather;
1721
1722 // e.g. result of a single 8x32bit integer gather for 32bit components
1723 // 256i - 0 1 2 3 4 5 6 7
1724 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1725 #endif
1726 }
1727 else
1728 {
1729 #if USE_SIMD16_GATHERS
1730 pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
1731 #else
1732 #if USE_SIMD16_SHADERS
1733 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1734 #else
1735 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1736 #endif
1737 #endif
1738 }
1739
1740 if (currentVertexElement > 3)
1741 {
1742 #if USE_SIMD16_GATHERS
1743 // store SIMD16s
1744 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1745
1746 StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2);
1747 #else
1748 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1749 #endif
1750
1751 // reset to the next vVertexElement to output
1752 currentVertexElement = 0;
1753 }
1754
1755 }
1756
1757 // offset base to the next component in the vertex to gather
1758 pStreamBase = GEP(pStreamBase, C((char)4));
1759 }
1760 }
1761 break;
1762 }
1763 }
1764 }
1765
1766 // if we have a partially filled vVertexElement struct, output it
1767 if (currentVertexElement > 0)
1768 {
1769 #if USE_SIMD16_GATHERS
1770 // store SIMD16s
1771 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
1772
1773 StoreVertexElements16(pVtxOut2, outputElt++, currentVertexElement, pVtxSrc2);
1774 #else
1775 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1776 #endif
1777 }
1778 }
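// Illustrative sketch for this listing, not part of the jitter: a scalar
// equivalent of the 32-bit component conversions emitted above. SFIXED is
// treated as 16.16 fixed point, hence the multiply by 1/65536; USCALED and
// SSCALED simply convert the gathered integer to float. The helper name is a
// hypothetical addition for illustration only.
static inline float ConvertScaled32(int32_t gathered, ConversionType conversion)
{
    switch (conversion)
    {
    case CONVERT_USCALED: return static_cast<float>(static_cast<uint32_t>(gathered)); // unsigned integer -> float
    case CONVERT_SSCALED: return static_cast<float>(gathered);                        // signed integer -> float
    case CONVERT_SFIXED:  return static_cast<float>(gathered) * (1.0f / 65536.0f);    // 16.16 fixed -> float, e.g. 0x00018000 -> 1.5f
    default:              return 0.0f; // placeholder: CONVERT_NONE stores the raw gathered bits unmodified
    }
}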
1779
1780 //////////////////////////////////////////////////////////////////////////
1781 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1782 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1783 /// support
1784 /// @param pIndices - pointer to 8 bit indices
1785 /// @param pLastIndex - pointer to last valid index
1786 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1787 {
1788 // can fit 4 8 bit integers per vWidth lane
1789 Value* vIndices = VUNDEF_I();
1790
1791 // store 0 index on stack to be used to conditionally load from if index address is OOB
1792 Value* pZeroIndex = ALLOCA(mInt8Ty);
1793 STORE(C((uint8_t)0), pZeroIndex);
1794
1795 // Load a SIMD of index pointers
1796 for(int64_t lane = 0; lane < mVWidth; lane++)
1797 {
1798 // Calculate the address of the requested index
1799 Value *pIndex = GEP(pIndices, C(lane));
1800
1801 // check if the address is less than the max index,
1802 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1803
1804 // if valid, load the index. if not, load 0 from the stack
1805 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1806 Value *index = LOAD(pValid, "valid index");
1807
1808 // zero extend the index to 32 bits and insert it into the correct simd lane
1809 index = Z_EXT(index, mInt32Ty);
1810 vIndices = VINSERT(vIndices, index, lane);
1811 }
1812 return vIndices;
1813 }
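// Illustrative sketch for this listing, not part of the jitter: the scalar
// logic each lane of the loop above implements. Out-of-bounds lanes read a
// zero stored on the stack instead of reading past the end of the index
// buffer, then the value is zero extended to 32 bits. The helper name is a
// hypothetical addition for illustration only.
static inline uint32_t LoadValidIndex8(const uint8_t* pIndex, const uint8_t* pLastIndex)
{
    static const uint8_t zeroIndex = 0;
    // if the requested address reaches the last valid index, load 0 instead
    const uint8_t* pValid = (pIndex < pLastIndex) ? pIndex : &zeroIndex;
    // zero extend to 32 bits, matching the Z_EXT in the jitted code
    return static_cast<uint32_t>(*pValid);
}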
1814
1815 //////////////////////////////////////////////////////////////////////////
1816 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1817 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1818 /// support
1819 /// @param pIndices - pointer to 16 bit indices
1820 /// @param pLastIndex - pointer to last valid index
1821 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1822 {
1823 // can fit 2 16 bit integers per vWidth lane
1824 Value* vIndices = VUNDEF_I();
1825
1826 // store 0 index on stack to be used to conditionally load from if index address is OOB
1827 Value* pZeroIndex = ALLOCA(mInt16Ty);
1828 STORE(C((uint16_t)0), pZeroIndex);
1829
1830 // Load a SIMD of index pointers
1831 for(int64_t lane = 0; lane < mVWidth; lane++)
1832 {
1833 // Calculate the address of the requested index
1834 Value *pIndex = GEP(pIndices, C(lane));
1835
1836 // check if the address is less than the max index,
1837 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1838
1839 // if valid, load the index. if not, load 0 from the stack
1840 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1841 Value *index = LOAD(pValid, "valid index");
1842
1843 // zero extend the index to 32 bits and insert it into the correct simd lane
1844 index = Z_EXT(index, mInt32Ty);
1845 vIndices = VINSERT(vIndices, index, lane);
1846 }
1847 return vIndices;
1848 }
1849
1850 //////////////////////////////////////////////////////////////////////////
1851 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1852 /// @param pIndices - pointer to 32 bit indices
1853 /// @param pLastIndex - pointer to last valid index
1854 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1855 {
1856 DataLayout dL(JM()->mpCurrentModule);
1857 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1858 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1859 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1860
1861 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1862 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1863 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1864 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1865
1866 // create a vector of index counts from the base index ptr passed into the fetch
1867 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1868 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1869
1870 // compare index count to the max valid index
1871 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1872 // vIndexOffsets 0 1 2 3 4 5 6 7
1873 // ------------------------------
1874 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1875 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1876 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1877 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1878
1879 // VMASKLOAD takes an *i8 src pointer
1880 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1881
1882 // Load the indices; OOB loads 0
1883 return MASKLOADD(pIndices,vIndexMask);
1884 }
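// Illustrative sketch for this listing, not part of the jitter: what the
// masked load above computes per lane of 32-bit indices. The number of valid
// indices is (pLastIndex - pIndices); lanes whose offset reaches past the end
// are masked off and yield 0, matching the VMASKLOAD semantics. The helper
// name is a hypothetical addition for illustration only.
static inline uint32_t LoadValidIndex32(const uint32_t* pIndices, const uint32_t* pLastIndex, uint32_t lane)
{
    // in-bounds lanes load their index, out-of-bounds lanes load 0
    return (pIndices + lane < pLastIndex) ? pIndices[lane] : 0u;
}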
1885
1886 //////////////////////////////////////////////////////////////////////////
1887 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1888 /// denormalizes if needed, converts to F32 if needed, and positions in
1889 /// the proper SIMD rows to be output to the simdvertex structure
1890 /// @param args: (tuple of args, listed below)
1891 /// @param vGatherResult - 8 gathered 8bpc vertices
1892 /// @param pVtxOut - base pointer to output simdvertex struct
1893 /// @param extendType - sign extend or zero extend
1894 /// @param conversionType - conversion to apply (none, normalized, or scaled)
1895 /// @param currentVertexElement - reference to the current vVertexElement
1896 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1897 /// @param compMask - component packing mask
1898 /// @param compCtrl - component control val
1899 /// @param vVertexElements[4] - vertex components to output
1900 /// @param swizzle[4] - component swizzle location
1901 #if USE_SIMD16_GATHERS
1902 void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args)
1903 {
1904 // Unpack tuple args
1905 Value*& vGatherResult = std::get<0>(args);
1906 Value* pVtxOut = std::get<1>(args);
1907 const Instruction::CastOps extendType = std::get<2>(args);
1908 const ConversionType conversionType = std::get<3>(args);
1909 uint32_t &currentVertexElement = std::get<4>(args);
1910 uint32_t &outputElt = std::get<5>(args);
1911 const ComponentEnable compMask = std::get<6>(args);
1912 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1913 Value* (&vVertexElements)[4] = std::get<8>(args);
1914 const uint32_t(&swizzle)[4] = std::get<9>(args);
1915
1916 // cast types
1917 Type *vGatherTy = mSimdInt32Ty;
1918 Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1919
1920 // have to do extra work for sign extending
1921 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
1922 {
1923 Type *v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
1924 Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1925
1926 // shuffle mask, including any swizzling
1927 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1928 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1929 Value *vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12),
1930 char(y), char(y + 4), char(y + 8), char(y + 12),
1931 char(z), char(z + 4), char(z + 8), char(z + 12),
1932 char(w), char(w + 4), char(w + 8), char(w + 12),
1933 char(x), char(x + 4), char(x + 8), char(x + 12),
1934 char(y), char(y + 4), char(y + 8), char(y + 12),
1935 char(z), char(z + 4), char(z + 8), char(z + 12),
1936 char(w), char(w + 4), char(w + 8), char(w + 12) });
1937
1938 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now...
1939
1940 Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1941 Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1942
1943 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1944 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1945
1946 // after pshufb: group components together in each 128bit lane
1947 // 256i - 0 1 2 3 4 5 6 7
1948 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1949
1950 Value *vi128XY_lo = nullptr;
1951 Value *vi128XY_hi = nullptr;
1952 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1953 {
1954 vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
1955 vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
1956
1957 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1958 // 256i - 0 1 2 3 4 5 6 7
1959 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1960 }
1961
1962 // do the same for zw components
1963 Value *vi128ZW_lo = nullptr;
1964 Value *vi128ZW_hi = nullptr;
1965 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1966 {
1967 vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
1968 vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
1969 }
1970
1971 // init denormalize variables if needed
1972 Instruction::CastOps fpCast;
1973 Value *conversionFactor;
1974
1975 switch (conversionType)
1976 {
1977 case CONVERT_NORMALIZED:
1978 fpCast = Instruction::CastOps::SIToFP;
1979 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1980 break;
1981 case CONVERT_SSCALED:
1982 fpCast = Instruction::CastOps::SIToFP;
1983 conversionFactor = VIMMED1((float)(1.0));
1984 break;
1985 case CONVERT_USCALED:
1986 SWR_INVALID("Type should not be sign extended!");
1987 conversionFactor = nullptr;
1988 break;
1989 default:
1990 SWR_ASSERT(conversionType == CONVERT_NONE);
1991 conversionFactor = nullptr;
1992 break;
1993 }
1994
1995 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1996 for (uint32_t i = 0; i < 4; i++)
1997 {
1998 if (isComponentEnabled(compMask, i))
1999 {
2000 if (compCtrl[i] == ComponentControl::StoreSrc)
2001 {
2002 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2003 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2004 // if x or y, use vi128XY permute result, else use vi128ZW
2005 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
2006 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
2007
2008 // sign extend
2009 Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
2010 Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
2011
2012 // denormalize if needed
2013 if (conversionType != CONVERT_NONE)
2014 {
2015 temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2016 temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2017 }
2018
2019 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
2020
2021 currentVertexElement += 1;
2022 }
2023 else
2024 {
2025 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
2026 }
2027
2028 if (currentVertexElement > 3)
2029 {
2030 StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
2031 // reset to the next vVertexElement to output
2032 currentVertexElement = 0;
2033 }
2034 }
2035 }
2036 }
2037 // else zero extend
2038 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2039 {
2040 // init denormalize variables if needed
2041 Instruction::CastOps fpCast;
2042 Value *conversionFactor;
2043
2044 switch (conversionType)
2045 {
2046 case CONVERT_NORMALIZED:
2047 fpCast = Instruction::CastOps::UIToFP;
2048 conversionFactor = VIMMED1((float)(1.0 / 255.0));
2049 break;
2050 case CONVERT_USCALED:
2051 fpCast = Instruction::CastOps::UIToFP;
2052 conversionFactor = VIMMED1((float)(1.0));
2053 break;
2054 case CONVERT_SSCALED:
2055 SWR_INVALID("Type should not be zero extended!");
2056 conversionFactor = nullptr;
2057 break;
2058 default:
2059 SWR_ASSERT(conversionType == CONVERT_NONE);
2060 conversionFactor = nullptr;
2061 break;
2062 }
2063
2064 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
2065 for (uint32_t i = 0; i < 4; i++)
2066 {
2067 if (isComponentEnabled(compMask, i))
2068 {
2069 if (compCtrl[i] == ComponentControl::StoreSrc)
2070 {
2071 // pshufb masks for each component
2072 Value *vConstMask;
2073 switch (swizzle[i])
2074 {
2075 case 0:
2076 // x shuffle mask
2077 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
2078 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
2079 break;
2080 case 1:
2081 // y shuffle mask
2082 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
2083 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
2084 break;
2085 case 2:
2086 // z shuffle mask
2087 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
2088 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
2089 break;
2090 case 3:
2091 // w shuffle mask
2092 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
2093 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
2094 break;
2095 default:
2096 vConstMask = nullptr;
2097 break;
2098 }
2099
2100 Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
2101 Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
2102
2103 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2104 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2105
2106 // after pshufb for x channel
2107 // 256i - 0 1 2 3 4 5 6 7
2108 // x000 x000 x000 x000 x000 x000 x000 x000
2109
2110 // denormalize if needed
2111 if (conversionType != CONVERT_NONE)
2112 {
2113 temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2114 temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2115 }
2116
2117 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
2118
2119 currentVertexElement += 1;
2120 }
2121 else
2122 {
2123 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
2124 }
2125
2126 if (currentVertexElement > 3)
2127 {
2128 StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
2129 // reset to the next vVertexElement to output
2130 currentVertexElement = 0;
2131 }
2132 }
2133 }
2134 }
2135 else
2136 {
2137 SWR_INVALID("Unsupported conversion type");
2138 }
2139 }
2140
2141 #else
2142 #if USE_SIMD16_SHADERS
2143 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2)
2144 #else
2145 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
2146 #endif
2147 {
2148 // Unpack tuple args
2149 Value*& vGatherResult = std::get<0>(args);
2150 Value* pVtxOut = std::get<1>(args);
2151 const Instruction::CastOps extendType = std::get<2>(args);
2152 const ConversionType conversionType = std::get<3>(args);
2153 uint32_t &currentVertexElement = std::get<4>(args);
2154 uint32_t &outputElt = std::get<5>(args);
2155 const ComponentEnable compMask = std::get<6>(args);
2156 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2157 Value* (&vVertexElements)[4] = std::get<8>(args);
2158 const uint32_t(&swizzle)[4] = std::get<9>(args);
2159
2160 // cast types
2161 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2162
2163 for (uint32_t i = 0; i < 4; i++)
2164 {
2165 if (!isComponentEnabled(compMask, i))
2166 continue;
2167
2168 if (compCtrl[i] == ComponentControl::StoreSrc)
2169 {
2170 std::vector<uint32_t> vShuffleMasks[4] = {
2171 { 0, 4, 8, 12, 16, 20, 24, 28 }, // x
2172 { 1, 5, 9, 13, 17, 21, 25, 29 }, // y
2173 { 2, 6, 10, 14, 18, 22, 26, 30 }, // z
2174 { 3, 7, 11, 15, 19, 23, 27, 31 }, // w
2175 };
2176
2177 Value *val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
2178 UndefValue::get(v32x8Ty),
2179 vShuffleMasks[swizzle[i]]);
2180
2181 if ((extendType == Instruction::CastOps::SExt) ||
2182 (extendType == Instruction::CastOps::SIToFP)) {
2183 switch (conversionType)
2184 {
2185 case CONVERT_NORMALIZED:
2186 val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
2187 break;
2188 case CONVERT_SSCALED:
2189 val = SI_TO_FP(val, mSimdFP32Ty);
2190 break;
2191 case CONVERT_USCALED:
2192 SWR_INVALID("Type should not be sign extended!");
2193 break;
2194 default:
2195 SWR_ASSERT(conversionType == CONVERT_NONE);
2196 val = S_EXT(val, mSimdInt32Ty);
2197 break;
2198 }
2199 }
2200 else if ((extendType == Instruction::CastOps::ZExt) ||
2201 (extendType == Instruction::CastOps::UIToFP)) {
2202 switch (conversionType)
2203 {
2204 case CONVERT_NORMALIZED:
2205 val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
2206 break;
2207 case CONVERT_SSCALED:
2208 SWR_INVALID("Type should not be zero extended!");
2209 break;
2210 case CONVERT_USCALED:
2211 val = UI_TO_FP(val, mSimdFP32Ty);
2212 break;
2213 default:
2214 SWR_ASSERT(conversionType == CONVERT_NONE);
2215 val = Z_EXT(val, mSimdInt32Ty);
2216 break;
2217 }
2218 }
2219 else
2220 {
2221 SWR_INVALID("Unsupported conversion type");
2222 }
2223
2224 vVertexElements[currentVertexElement++] = val;
2225 }
2226 else
2227 {
2228 #if USE_SIMD16_SHADERS
2229 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2230 #else
2231 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2232 #endif
2233 }
2234
2235 if (currentVertexElement > 3)
2236 {
2237 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2238 // reset to the next vVertexElement to output
2239 currentVertexElement = 0;
2240 }
2241 }
2242 }
2243
2244 #endif
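// Illustrative sketch for this listing, not part of the jitter: the scalar
// equivalent of one lane of the 8bpc shuffles above. A 32-bit gather element
// holds the packed x/y/z/w bytes; the requested byte is selected via the
// swizzle, sign or zero extended, and for the normalized formats scaled by
// 1/127 (SNORM) or 1/255 (UNORM). The helper name is a hypothetical addition
// for illustration only.
static inline float Shuffle8bpcScalar(uint32_t packedXYZW, uint32_t swizzle, bool isSigned, bool normalized)
{
    // select the requested component byte within the 32-bit element
    uint8_t byteVal = static_cast<uint8_t>((packedXYZW >> (swizzle * 8)) & 0xFF);
    // sign or zero extend to 32 bits
    int32_t extended = isSigned ? static_cast<int32_t>(static_cast<int8_t>(byteVal))
                                : static_cast<int32_t>(byteVal);
    if (!normalized)
    {
        return static_cast<float>(extended); // USCALED / SSCALED just convert to float
    }
    // UNORM / SNORM divide by the maximum representable magnitude
    return isSigned ? static_cast<float>(extended) * (1.0f / 127.0f)
                    : static_cast<float>(extended) * (1.0f / 255.0f);
}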
2245 //////////////////////////////////////////////////////////////////////////
2246 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
2247 /// denormalizes if needed, converts to F32 if needed, and positions in
2248 /// the proper SIMD rows to be output to the simdvertex structure
2249 /// @param args: (tuple of args, listed below)
2250 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
2251 /// @param pVtxOut - base pointer to output simdvertex struct
2252 /// @param extendType - sign extend or zero extend
2253 /// @param conversionType - conversion to apply (none, normalized, or scaled)
2254 /// @param currentVertexElement - reference to the current vVertexElement
2255 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
2256 /// @param compMask - component packing mask
2257 /// @param compCtrl - component control val
2258 /// @param vVertexElements[4] - vertex components to output
2259 #if USE_SIMD16_GATHERS
2260 void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
2261 {
2262 // Unpack tuple args
2263 Value* (&vGatherResult)[2] = std::get<0>(args);
2264 Value* pVtxOut = std::get<1>(args);
2265 const Instruction::CastOps extendType = std::get<2>(args);
2266 const ConversionType conversionType = std::get<3>(args);
2267 uint32_t &currentVertexElement = std::get<4>(args);
2268 uint32_t &outputElt = std::get<5>(args);
2269 const ComponentEnable compMask = std::get<6>(args);
2270 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2271 Value* (&vVertexElements)[4] = std::get<8>(args);
2272
2273 // cast types
2274 Type *vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
2275 Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2276
2277 // have to do extra work for sign extending
2278 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
2279 {
2280 // is this half-precision (FP16) float data?
2281 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
2282
2283 Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
2284 Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2285
2286 // shuffle mask
2287 Value *vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2288 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
2289 Value *vi128XY_lo = nullptr;
2290 Value *vi128XY_hi = nullptr;
2291 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
2292 {
2293 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now...
2294
2295 Value *vGatherResult_lo = EXTRACT_16(vGatherResult[0], 0);
2296 Value *vGatherResult_hi = EXTRACT_16(vGatherResult[0], 1);
2297
2298 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2299 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2300
2301 // after pshufb: group components together in each 128bit lane
2302 // 256i - 0 1 2 3 4 5 6 7
2303 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
2304
2305 vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2306 vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2307
2308 // after PERMD: move and pack xy components into each 128bit lane
2309 // 256i - 0 1 2 3 4 5 6 7
2310 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
2311 }
2312
2313 // do the same for zw components
2314 Value *vi128ZW_lo = nullptr;
2315 Value *vi128ZW_hi = nullptr;
2316 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
2317 {
2318 Value *vGatherResult_lo = EXTRACT_16(vGatherResult[1], 0);
2319 Value *vGatherResult_hi = EXTRACT_16(vGatherResult[1], 1);
2320
2321 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2322 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2323
2324 vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2325 vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2326 }
2327
2328 // init denormalize variables if needed
2329 Instruction::CastOps IntToFpCast;
2330 Value *conversionFactor;
2331
2332 switch (conversionType)
2333 {
2334 case CONVERT_NORMALIZED:
2335 IntToFpCast = Instruction::CastOps::SIToFP;
2336 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2337 break;
2338 case CONVERT_SSCALED:
2339 IntToFpCast = Instruction::CastOps::SIToFP;
2340 conversionFactor = VIMMED1((float)(1.0));
2341 break;
2342 case CONVERT_USCALED:
2343 SWR_INVALID("Type should not be sign extended!");
2344 conversionFactor = nullptr;
2345 break;
2346 default:
2347 SWR_ASSERT(conversionType == CONVERT_NONE);
2348 conversionFactor = nullptr;
2349 break;
2350 }
2351
2352 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2353 for (uint32_t i = 0; i < 4; i++)
2354 {
2355 if (isComponentEnabled(compMask, i))
2356 {
2357 if (compCtrl[i] == ComponentControl::StoreSrc)
2358 {
2359 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2360 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2361 // if x or y, use vi128XY permute result, else use vi128ZW
2362 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
2363 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
2364
2365 if (bFP)
2366 {
2367 // extract 128 bit lanes and convert each half-float component to 32-bit float
2368 Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
2369 Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
2370
2371 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
2372 }
2373 else
2374 {
2375 // extract 128 bit lanes to sign extend each component
2376 Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
2377 Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
2378
2379 // denormalize if needed
2380 if (conversionType != CONVERT_NONE)
2381 {
2382 temp_lo = FMUL(CAST(IntToFpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2383 temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2384 }
2385
2386 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
2387 }
2388
2389 currentVertexElement += 1;
2390 }
2391 else
2392 {
2393 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
2394 }
2395
2396 if (currentVertexElement > 3)
2397 {
2398 StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
2399 // reset to the next vVertexElement to output
2400 currentVertexElement = 0;
2401 }
2402 }
2403 }
2404 }
2405 // else zero extend
2406 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2407 {
2408 // pshufb masks for each component
2409 Value *vConstMask[2];
2410
2411 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
2412 {
2413 // x/z shuffle mask
2414 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2415 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
2416 }
2417
2418 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
2419 {
2420 // y/w shuffle mask
2421 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2422 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
2423 }
2424
2425 // init denormalize variables if needed
2426 Instruction::CastOps fpCast;
2427 Value* conversionFactor;
2428
2429 switch (conversionType)
2430 {
2431 case CONVERT_NORMALIZED:
2432 fpCast = Instruction::CastOps::UIToFP;
2433 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2434 break;
2435 case CONVERT_USCALED:
2436 fpCast = Instruction::CastOps::UIToFP;
2437 conversionFactor = VIMMED1((float)(1.0f));
2438 break;
2439 case CONVERT_SSCALED:
2440 SWR_INVALID("Type should not be zero extended!");
2441 conversionFactor = nullptr;
2442 break;
2443 default:
2444 SWR_ASSERT(conversionType == CONVERT_NONE);
2445 conversionFactor = nullptr;
2446 break;
2447 }
2448
2449 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2450 for (uint32_t i = 0; i < 4; i++)
2451 {
2452 if (isComponentEnabled(compMask, i))
2453 {
2454 if (compCtrl[i] == ComponentControl::StoreSrc)
2455 {
2456 // select correct constMask for x/z or y/w pshufb
2457 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2458 // if x or y, use vi128XY permute result, else use vi128ZW
2459 uint32_t selectedGather = (i < 2) ? 0 : 1;
2460
2461 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now...
2462
2463 Value *vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
2464 Value *vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
2465
2466 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2467 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2468
2469 // after pshufb mask for x channel; z uses the same shuffle from the second gather
2470 // 256i - 0 1 2 3 4 5 6 7
2471 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2472
2473 // denormalize if needed
2474 if (conversionType != CONVERT_NONE)
2475 {
2476 temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2477 temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2478 }
2479
2480 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
2481
2482 currentVertexElement += 1;
2483 }
2484 else
2485 {
2486 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]);
2487 }
2488
2489 if (currentVertexElement > 3)
2490 {
2491 StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements);
2492 // reset to the next vVertexElement to output
2493 currentVertexElement = 0;
2494 }
2495 }
2496 }
2497 }
2498 else
2499 {
2500 SWR_INVALID("Unsupported conversion type");
2501 }
2502 }
2503
2504 #else
2505 #if USE_SIMD16_SHADERS
2506 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2)
2507 #else
2508 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
2509 #endif
2510 {
2511 // Unpack tuple args
2512 Value* (&vGatherResult)[2] = std::get<0>(args);
2513 Value* pVtxOut = std::get<1>(args);
2514 const Instruction::CastOps extendType = std::get<2>(args);
2515 const ConversionType conversionType = std::get<3>(args);
2516 uint32_t &currentVertexElement = std::get<4>(args);
2517 uint32_t &outputElt = std::get<5>(args);
2518 const ComponentEnable compMask = std::get<6>(args);
2519 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2520 Value* (&vVertexElements)[4] = std::get<8>(args);
2521
2522 // cast types
2523 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
2524 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2525
2526 // have to do extra work for sign extending
2527 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) ||
2528 (extendType == Instruction::CastOps::FPExt))
2529 {
2530 // is this half-precision (FP16) float data?
2531 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
2532
2533 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
2534 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2535
2536 // shuffle mask
2537 Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2538 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
2539 Value* vi128XY = nullptr;
2540 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) {
2541 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
2542 // after pshufb: group components together in each 128bit lane
2543 // 256i - 0 1 2 3 4 5 6 7
2544 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
2545
2546 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2547 // after PERMD: move and pack xy components into each 128bit lane
2548 // 256i - 0 1 2 3 4 5 6 7
2549 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
2550 }
2551
2552 // do the same for zw components
2553 Value* vi128ZW = nullptr;
2554 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) {
2555 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
2556 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2557 }
2558
2559 // init denormalize variables if needed
2560 Instruction::CastOps IntToFpCast;
2561 Value* conversionFactor;
2562
2563 switch (conversionType)
2564 {
2565 case CONVERT_NORMALIZED:
2566 IntToFpCast = Instruction::CastOps::SIToFP;
2567 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2568 break;
2569 case CONVERT_SSCALED:
2570 IntToFpCast = Instruction::CastOps::SIToFP;
2571 conversionFactor = VIMMED1((float)(1.0));
2572 break;
2573 case CONVERT_USCALED:
2574 SWR_INVALID("Type should not be sign extended!");
2575 conversionFactor = nullptr;
2576 break;
2577 default:
2578 SWR_ASSERT(conversionType == CONVERT_NONE);
2579 conversionFactor = nullptr;
2580 break;
2581 }
2582
2583 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2584 for (uint32_t i = 0; i < 4; i++)
2585 {
2586 if (isComponentEnabled(compMask, i))
2587 {
2588 if (compCtrl[i] == ComponentControl::StoreSrc)
2589 {
2590 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2591 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2592 // if x or y, use vi128XY permute result, else use vi128ZW
2593 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
2594
2595 if (bFP) {
2596 // extract 128 bit lanes and convert each half-float component to 32-bit float
2597 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2598 }
2599 else {
2600 // extract 128 bit lanes to sign extend each component
2601 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2602
2603 // denormalize if needed
2604 if (conversionType != CONVERT_NONE) {
2605 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2606 }
2607 }
2608 currentVertexElement++;
2609 }
2610 else
2611 {
2612 #if USE_SIMD16_SHADERS
2613 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2614 #else
2615 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2616 #endif
2617 }
2618
2619 if (currentVertexElement > 3)
2620 {
2621 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2622 // reset to the next vVertexElement to output
2623 currentVertexElement = 0;
2624 }
2625 }
2626 }
2627 }
2628 // else zero extend
2629 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2630 {
2631 // pshufb masks for each component
2632 Value* vConstMask[2];
2633 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) {
2634 // x/z shuffle mask
2635 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2636 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
2637 }
2638
2639 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) {
2640 // y/w shuffle mask
2641 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2642 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
2643 }
2644
2645 // init denormalize variables if needed
2646 Instruction::CastOps fpCast;
2647 Value* conversionFactor;
2648
2649 switch (conversionType)
2650 {
2651 case CONVERT_NORMALIZED:
2652 fpCast = Instruction::CastOps::UIToFP;
2653 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2654 break;
2655 case CONVERT_USCALED:
2656 fpCast = Instruction::CastOps::UIToFP;
2657 conversionFactor = VIMMED1((float)(1.0f));
2658 break;
2659 case CONVERT_SSCALED:
2660 SWR_INVALID("Type should not be zero extended!");
2661 conversionFactor = nullptr;
2662 break;
2663 default:
2664 SWR_ASSERT(conversionType == CONVERT_NONE);
2665 conversionFactor = nullptr;
2666 break;
2667 }
2668
2669 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2670 for (uint32_t i = 0; i < 4; i++)
2671 {
2672 if (isComponentEnabled(compMask, i))
2673 {
2674 if (compCtrl[i] == ComponentControl::StoreSrc)
2675 {
2676 // select correct constMask for x/z or y/w pshufb
2677 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2678 // if x or y, use vi128XY permute result, else use vi128ZW
2679 uint32_t selectedGather = (i < 2) ? 0 : 1;
2680
2681 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2682 // after pshufb mask for x channel; z uses the same shuffle from the second gather
2683 // 256i - 0 1 2 3 4 5 6 7
2684 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2685
2686 // denormalize if needed
2687 if (conversionType != CONVERT_NONE)
2688 {
2689 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2690 }
2691 currentVertexElement++;
2692 }
2693 else
2694 {
2695 #if USE_SIMD16_SHADERS
2696 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2697 #else
2698 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2699 #endif
2700 }
2701
2702 if (currentVertexElement > 3)
2703 {
2704 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2705 // reset to the next vVertexElement to output
2706 currentVertexElement = 0;
2707 }
2708 }
2709 }
2710 }
2711 else
2712 {
2713 SWR_INVALID("Unsupported conversion type");
2714 }
2715 }
2716
2717 #endif
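// Illustrative sketch for this listing, not part of the jitter: a scalar view
// of the 16bpc conversions above. The first gather supplies packed xy pairs
// and the second supplies zw pairs; each 16-bit value is extended and, for
// the normalized formats, scaled by 1/32767 (SNORM) or 1/65535 (UNORM). FP16
// data instead goes through a half-to-float conversion (CVTPH2PS in the
// vector code), which is not reproduced here. The helper name is a
// hypothetical addition for illustration only.
static inline float Shuffle16bpcScalar(uint32_t packedPair, bool highHalf, bool isSigned, bool normalized)
{
    // select the low (x/z) or high (y/w) 16-bit component of the pair
    uint16_t wordVal = static_cast<uint16_t>(highHalf ? (packedPair >> 16) : (packedPair & 0xFFFF));
    // sign or zero extend to 32 bits
    int32_t extended = isSigned ? static_cast<int32_t>(static_cast<int16_t>(wordVal))
                                : static_cast<int32_t>(wordVal);
    if (!normalized)
    {
        return static_cast<float>(extended); // USCALED / SSCALED just convert to float
    }
    // UNORM / SNORM divide by the maximum representable magnitude
    return isSigned ? static_cast<float>(extended) * (1.0f / 32767.0f)
                    : static_cast<float>(extended) * (1.0f / 65535.0f);
}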
2718 //////////////////////////////////////////////////////////////////////////
2719 /// @brief Output a simdvertex worth of elements to the current outputElt
2720 /// @param pVtxOut - base address of VIN output struct
2721 /// @param outputElt - simdvertex offset in VIN to write to
2722 /// @param numEltsToStore - number of simdvertex rows to write out
2723 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
2724 #if USE_SIMD16_GATHERS
2725 void FetchJit::StoreVertexElements16(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2726 {
2727 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2728
2729 for (uint32_t c = 0; c < numEltsToStore; ++c)
2730 {
2731 // STORE expects FP32 x vWidth type, just bitcast if needed
2732 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2733 {
2734 #if FETCH_DUMP_VERTEX
2735 PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
2736 #endif
2737 vVertexElements[c] = BITCAST(vVertexElements[c], mSimd16FP32Ty);
2738 }
2739 #if FETCH_DUMP_VERTEX
2740 else
2741 {
2742 PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
2743 }
2744 #endif
2745 // outputElt * 4 = offsetting by the size of a simdvertex
2746 // + c offsets to a 32bit x vWidth row within the current vertex
2747 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
2748 STORE(vVertexElements[c], dest);
2749 }
2750 }
2751
2752 #else
2753 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2754 {
2755 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2756
2757 for (uint32_t c = 0; c < numEltsToStore; ++c)
2758 {
2759 // STORE expects FP32 x vWidth type, just bitcast if needed
2760 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2761 {
2762 #if FETCH_DUMP_VERTEX
2763 PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
2764 #endif
2765 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
2766 }
2767 #if FETCH_DUMP_VERTEX
2768 else
2769 {
2770 PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
2771 }
2772 #endif
2773 // outputElt * 4 = offsetting by the size of a simdvertex
2774 // + c offsets to a 32bit x vWidth row within the current vertex
2775 #if USE_SIMD16_SHADERS
2776 Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP");
2777 #else
2778 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
2779 #endif
2780 STORE(vVertexElements[c], dest);
2781 }
2782 }
2783
2784 #endif
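// Illustrative sketch for this listing, not part of the jitter: the row
// addressing used by the stores above. A simdvertex is 4 rows of vWidth
// floats, so component c of output slot outputElt lands at row
// (outputElt * 4 + c); when SIMD16 shaders are fed by SIMD8 gathers each
// logical row spans two SIMD8 rows, giving (outputElt * 8 + c * 2). The
// helper name is a hypothetical addition for illustration only.
static inline uint32_t VertexElementRow(uint32_t outputElt, uint32_t c, bool simd16ShaderOnSimd8Gather)
{
    return simd16ShaderOnSimd8Gather ? (outputElt * 8 + c * 2)  // two SIMD8 rows per SIMD16 row
                                     : (outputElt * 4 + c);     // one row per component
}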
2785 //////////////////////////////////////////////////////////////////////////
2786 /// @brief Generates a constant vector of values based on the
2787 /// ComponentControl value
2788 /// @param ctrl - ComponentControl value
2789 #if USE_SIMD16_GATHERS
2790 Value *FetchJit::GenerateCompCtrlVector16(const ComponentControl ctrl)
2791 {
2792 switch (ctrl)
2793 {
2794 case NoStore:
2795 return VUNDEF_I_16();
2796 case Store0:
2797 return VIMMED1_16(0);
2798 case Store1Fp:
2799 return VIMMED1_16(1.0f);
2800 case Store1Int:
2801 return VIMMED1_16(1);
2802 case StoreVertexId:
2803 {
2804 Value *pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
2805 Value *pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
2806
2807 Value *pId = JOIN_16(pId_lo, pId_hi);
2808
2809 return pId;
2810 }
2811 case StoreInstanceId:
2812 {
2813 Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
2814 return VBROADCAST_16(pId);
2815 }
2816
2817
2818 case StoreSrc:
2819 default:
2820 SWR_INVALID("Invalid component control");
2821 return VUNDEF_I_16();
2822 }
2823 }
2824
2825 #else
2826 #if USE_SIMD16_SHADERS
2827 Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2)
2828 #else
2829 Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
2830 #endif
2831 {
2832 switch (ctrl)
2833 {
2834 case NoStore:
2835 return VUNDEF_I();
2836 case Store0:
2837 return VIMMED1(0);
2838 case Store1Fp:
2839 return VIMMED1(1.0f);
2840 case Store1Int:
2841 return VIMMED1(1);
2842 case StoreVertexId:
2843 {
2844 #if USE_SIMD16_SHADERS
2845 Value *pId;
2846 if (useVertexID2)
2847 {
2848 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
2849 }
2850 else
2851 {
2852 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
2853 }
2854 #else
2855 Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
2856 #endif
2857 return pId;
2858 }
2859 case StoreInstanceId:
2860 {
2861 Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
2862 return VBROADCAST(pId);
2863 }
2864
2865
2866 case StoreSrc:
2867 default:
2868 SWR_INVALID("Invalid component control");
2869 return VUNDEF_I();
2870 }
2871 }
2872
2873 #endif
2874 //////////////////////////////////////////////////////////////////////////
2875 /// @brief Returns the enable mask for the specified component.
2876 /// @param enableMask - enable bits
2877 /// @param component - component to check if enabled.
2878 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
2879 {
2880 switch (component)
2881 {
2882 // X
2883 case 0: return (enableMask & ComponentEnable::X);
2884 // Y
2885 case 1: return (enableMask & ComponentEnable::Y);
2886 // Z
2887 case 2: return (enableMask & ComponentEnable::Z);
2888 // W
2889 case 3: return (enableMask & ComponentEnable::W);
2890
2891 default: return false;
2892 }
2893 }
2894
2895 // Don't allow two threads to compile the same fetch shader simultaneously;
2896 // concurrent compiles have problems in the JIT cache implementation.
2897 // This is only a problem for fetch right now.
2898 static std::mutex gFetchCodegenMutex;
2899
2900 //////////////////////////////////////////////////////////////////////////
2901 /// @brief JITs from fetch shader IR
2902 /// @param hJitMgr - JitManager handle
2903 /// @param func - LLVM function IR
2904 /// @return PFN_FETCH_FUNC - pointer to fetch code
2905 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
2906 {
2907 const llvm::Function* func = (const llvm::Function*)hFunc;
2908 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2909 PFN_FETCH_FUNC pfnFetch;
2910
2911 gFetchCodegenMutex.lock();
2912 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
2913 // MCJIT finalizes modules the first time you JIT code from them. Once a module is finalized, no new IR can be added to it
2914 pJitMgr->mIsModuleFinalized = true;
2915
2916 #if defined(KNOB_SWRC_TRACING)
2917 char fName[1024];
2918 const char *funcName = func->getName().data();
2919 sprintf(fName, "%s.bin", funcName);
2920 FILE *fd = fopen(fName, "wb");
2921 fwrite((void *)pfnFetch, 1, 2048, fd);
2922 fclose(fd);
2923 #endif
2924
2925 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
2926 gFetchCodegenMutex.unlock();
2927
2928
2929
2930 return pfnFetch;
2931 }
2932
2933 //////////////////////////////////////////////////////////////////////////
2934 /// @brief JIT compiles fetch shader
2935 /// @param hJitMgr - JitManager handle
2936 /// @param state - fetch state to build function from
2937 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
2938 {
2939 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2940
2941 pJitMgr->SetupNewModule();
2942
2943 FetchJit theJit(pJitMgr);
2944 HANDLE hFunc = theJit.Create(state);
2945
2946 return JitFetchFunc(hJitMgr, hFunc);
2947 }
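// Usage sketch (hedged, not part of the driver source): a caller would
// typically invoke JitCompileFetch once per unique FETCH_COMPILE_STATE and
// cache the returned PFN_FETCH_FUNC for reuse across draws; the exact
// function pointer signature is declared in fetch_jit.h.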