1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "jit_api.h"
32 #include "fetch_jit.h"
33 #include "gen_state_llvm.h"
34 #include <sstream>
35 #include <tuple>
36
37 //#define FETCH_DUMP_VERTEX 1
38 using namespace llvm;
39 using namespace SwrJit;
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45 CONVERT_NONE,
46 CONVERT_NORMALIZED,
47 CONVERT_USCALED,
48 CONVERT_SSCALED,
49 CONVERT_SFIXED,
50 };
51
52 //////////////////////////////////////////////////////////////////////////
53 /// Interface to Jitting a fetch shader
54 //////////////////////////////////////////////////////////////////////////
55 struct FetchJit : public Builder
56 {
57 FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
58
59 Function* Create(const FETCH_COMPILE_STATE& fetchState);
60 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
61 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
62 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
63
64 // package up Shuffle*bpcGatherd args into a tuple for convenience
65 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
66 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
67 const uint32_t(&)[4]> Shuffle8bpcArgs;
68 #if USE_SIMD16_SHADERS
69 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2);
70 #else
71 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
72 #endif
73 #if USE_SIMD16_BUILDER
74 void Shuffle8bpcGatherd2(Shuffle8bpcArgs &args);
75 #endif
76
77 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
78 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
79 #if USE_SIMD16_SHADERS
80 void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2);
81 #else
82 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
83 #endif
84 #if USE_SIMD16_BUILDER
85 void Shuffle16bpcGather2(Shuffle16bpcArgs &args);
86 #endif
87
88 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
89 #if USE_SIMD16_BUILDER
90 void StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
91 #endif
92
93 #if USE_SIMD16_SHADERS
94 Value* GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2);
95 #else
96 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
97 #endif
98 #if USE_SIMD16_BUILDER
99 Value* GenerateCompCtrlVector2(const ComponentControl ctrl);
100 #endif
101
102 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
103 #if USE_SIMD16_SHADERS
104 #define USE_SIMD16_GATHERS 0
105
106 #if USE_SIMD16_GATHERS
107 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2);
108 #else
109 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2);
110 #endif
111 #else
112 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
113 #endif
114
115 bool IsOddFormat(SWR_FORMAT format);
116 bool IsUniformFormat(SWR_FORMAT format);
117 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
118 void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
119 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
120
121 Value* mpFetchInfo;
122 };
123
124 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
125 {
126 std::stringstream fnName("FetchShader_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
127 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
128
129 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
130 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
131
132 fetch->getParent()->setModuleIdentifier(fetch->getName());
133
134 IRB()->SetInsertPoint(entry);
135
136 auto argitr = fetch->arg_begin();
137
138 // Fetch shader arguments
139 mpFetchInfo = &*argitr; ++argitr;
140 mpFetchInfo->setName("fetchInfo");
141 Value* pVtxOut = &*argitr;
142 pVtxOut->setName("vtxOutput");
143 // this is just shorthand to tell LLVM to get a pointer to the base address of the simdvertex
144 // index 0 (just the pointer to the simdvertex structure)
145 // index 1 (which element of the simdvertex structure to offset to, in this case 0)
146 // so the indices being i32's doesn't matter
147 // TODO: generate this GEP with a VECTOR structure type so this makes sense
148 std::vector<Value*> vtxInputIndices(2, C(0));
149 // GEP
150 pVtxOut = GEP(pVtxOut, C(0));
151 #if USE_SIMD16_SHADERS
152 #if 0// USE_SIMD16_BUILDER
153 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
154 #else
155 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
156 #endif
157 #else
158 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
159 #endif
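// at this point pVtxOut is viewed as a pointer to SIMD-width float vectors
// (e.g. <8 x float> when mVWidth == 8), so subsequent GEPs on pVtxOut step one
// full SIMD register at a time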
160
161 // SWR_FETCH_CONTEXT::pStreams
162 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
163 streams->setName("pStreams");
164
165 // SWR_FETCH_CONTEXT::pIndices
166 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
167 indices->setName("pIndices");
168
169 // SWR_FETCH_CONTEXT::pLastIndex
170 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
171 pLastIndex->setName("pLastIndex");
172
173
174 Value* vIndices;
175 #if USE_SIMD16_SHADERS
176 Value* indices2;
177 Value* vIndices2;
178 #endif
179 switch(fetchState.indexType)
180 {
181 case R8_UINT:
182 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
183 #if USE_SIMD16_SHADERS
184 indices2 = GEP(indices, C(8));
185 #endif
186 if(fetchState.bDisableIndexOOBCheck)
187 {
188 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
189 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
190 #if USE_SIMD16_SHADERS
191 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
192 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
193 #endif
194 }
195 else
196 {
197 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
198 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
199 #if USE_SIMD16_SHADERS
200 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
201 vIndices2 = GetSimdValid8bitIndices(indices2, pLastIndex);
202 #endif
203 }
204 break;
205 case R16_UINT:
206 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
207 #if USE_SIMD16_SHADERS
208 indices2 = GEP(indices, C(8));
209 #endif
210 if(fetchState.bDisableIndexOOBCheck)
211 {
212 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
213 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
214 #if USE_SIMD16_SHADERS
215 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
216 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
217 #endif
218 }
219 else
220 {
221 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
222 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
223 #if USE_SIMD16_SHADERS
224 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
225 vIndices2 = GetSimdValid16bitIndices(indices2, pLastIndex);
226 #endif
227 }
228 break;
229 case R32_UINT:
230 #if USE_SIMD16_SHADERS
231 indices2 = GEP(indices, C(8));
232 #endif
233 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
234 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
235 #if USE_SIMD16_SHADERS
236 (fetchState.bDisableIndexOOBCheck) ? vIndices2 = LOAD(BITCAST(indices2, PointerType::get(mSimdInt32Ty, 0)), { (uint32_t)0 })
237 : vIndices2 = GetSimdValid32bitIndices(indices2, pLastIndex);
238 #endif
239 break; // incoming type is already 32bit int
240 default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break;
241 }
242
243 if(fetchState.bForceSequentialAccessEnable)
244 {
245 Value* pOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
246
247 // VertexData buffers are accessed sequentially; the index is equal to the vertex number
248 vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
249 vIndices = ADD(vIndices, pOffsets);
250 #if USE_SIMD16_SHADERS
251 vIndices2 = ADD(vIndices, VIMMED1(8));
252 #endif
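// e.g. with StartVertex = 100, the SIMD8 lanes receive indices 100..107
// (and 108..115 for the second SIMD8 half when USE_SIMD16_SHADERS is enabled)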
253 }
254
255 Value* vVertexId = vIndices;
256 #if USE_SIMD16_SHADERS
257 Value* vVertexId2 = vIndices2;
258 #endif
259 if (fetchState.bVertexIDOffsetEnable)
260 {
261 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
262 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
263 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
264 vVertexId = ADD(vIndices, vBaseVertex);
265 vVertexId = ADD(vVertexId, vStartVertex);
266 #if USE_SIMD16_SHADERS
267 vVertexId2 = ADD(vIndices2, vBaseVertex);
268 vVertexId2 = ADD(vVertexId2, vStartVertex);
269 #endif
270 }
271
272 // store out vertex IDs
273 STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
274 #if USE_SIMD16_SHADERS
275 STORE(vVertexId2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
276 #endif
277
278 // store out cut mask if enabled
279 if (fetchState.bEnableCutIndex)
280 {
281 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
282 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
283 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
284 #if USE_SIMD16_SHADERS
285 Value* cutMask2 = VMASK(ICMP_EQ(vIndices2, vCutIndex));
286 STORE(cutMask2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
287 #endif
288 }
289
290 // Fetch attributes from memory and output to a simdvertex struct
291 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
292 #if USE_SIMD16_SHADERS
293 if (fetchState.bDisableVGATHER)
294 {
295 JitLoadVertices(fetchState, streams, vIndices, pVtxOut);
296 JitLoadVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)));
297 }
298 else
299 {
300 #if USE_SIMD16_GATHERS
301 JitGatherVertices(fetchState, streams, vIndices, vIndices2, pVtxOut, false);
302 #else
303 JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false);
304 JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true);
305 #endif
306 }
307 #else
308 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
309 : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
310 #endif
311
312 RET_VOID();
313
314 JitManager::DumpToFile(fetch, "src");
315
316 #if defined(_DEBUG)
317 verifyFunction(*fetch);
318 #endif
319
320 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
321
322 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
323 setupPasses.add(createBreakCriticalEdgesPass());
324 setupPasses.add(createCFGSimplificationPass());
325 setupPasses.add(createEarlyCSEPass());
326 setupPasses.add(createPromoteMemoryToRegisterPass());
327
328 setupPasses.run(*fetch);
329
330 JitManager::DumpToFile(fetch, "se");
331
332 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
333
334 ///@todo Haven't touched these either. Need to remove some of these and add others.
335 optPasses.add(createCFGSimplificationPass());
336 optPasses.add(createEarlyCSEPass());
337 optPasses.add(createInstructionCombiningPass());
338 optPasses.add(createInstructionSimplifierPass());
339 optPasses.add(createConstantPropagationPass());
340 optPasses.add(createSCCPPass());
341 optPasses.add(createAggressiveDCEPass());
342
343 optPasses.run(*fetch);
344 optPasses.run(*fetch);
345
346 JitManager::DumpToFile(fetch, "opt");
347
348 return fetch;
349 }
350
351 //////////////////////////////////////////////////////////////////////////
352 /// @brief Loads attributes from memory using LOADs, shuffling the
353 /// components into SOA form.
354 /// *Note* currently does not support component control,
355 /// component packing, instancing
356 /// @param fetchState - info about attributes to be fetched from memory
357 /// @param streams - value pointer to the current vertex stream
358 /// @param vIndices - vector value of indices to load
359 /// @param pVtxOut - value pointer to output simdvertex struct
360 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
361 {
362 // Zack shuffles; a variant of the Charleston.
363
364 std::vector<Value*> vectors(16);
365 std::vector<Constant*> pMask(mVWidth);
366 for(uint32_t i = 0; i < mVWidth; ++i)
367 {
368 pMask[i] = (C(i < 4 ? i : 4));
369 }
370 Constant* promoteMask = ConstantVector::get(pMask);
371 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
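// with mVWidth == 8 the promote mask is <0,1,2,3,4,4,4,4>: it widens each 4-wide
// attribute load to SIMD width, filling the upper lanes from the undef vector uwvec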
372
373 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
374 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
375 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
376 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
377 curInstance->setName("curInstance");
378
379 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
380 {
381 Value* elements[4] = {0};
382 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
383 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
384 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
385 uint32_t numComponents = info.numComps;
386 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
387
388 // load path doesn't support component packing
389 SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
390
391 vectors.clear();
392
393 if (fetchState.bInstanceIDOffsetEnable)
394 {
395 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down");
396 }
397
398 Value *vCurIndices;
399 Value *startOffset;
400 if(ied.InstanceEnable)
401 {
402 Value* stepRate = C(ied.InstanceAdvancementState);
403
404 // prevent a div by 0 for 0 step rate
405 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
406 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
407
408 // calc the current offset into instanced data buffer
409 Value* calcInstance = UDIV(curInstance, stepRate);
410
411 // if step rate is 0, every instance gets instance 0
412 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
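// e.g. with a step rate of 2, instances 0,1,2,3 fetch instanced elements 0,0,1,1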
413
414 vCurIndices = VBROADCAST(calcInstance);
415
416 startOffset = startInstance;
417 }
418 else if (ied.InstanceStrideEnable)
419 {
420 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
421 }
422 else
423 {
424 // offset indices by baseVertex
425 vCurIndices = ADD(vIndices, vBaseVertex);
426
427 startOffset = startVertex;
428 }
429
430 // load SWR_VERTEX_BUFFER_STATE::pData
431 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
432
433 // load SWR_VERTEX_BUFFER_STATE::pitch
434 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
435 stride = Z_EXT(stride, mInt64Ty);
436
437 // load SWR_VERTEX_BUFFER_STATE::size
438 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
439 size = Z_EXT(size, mInt64Ty);
440
441 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
442
443 Value *minVertex = NULL;
444 Value *minVertexOffset = NULL;
445 if (fetchState.bPartialVertexBuffer) {
446 // fetch min index for low bounds checking
447 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
448 minVertex = LOAD(minVertex);
449 if (!fetchState.bDisableIndexOOBCheck) {
450 minVertexOffset = MUL(Z_EXT(minVertex, mInt64Ty), stride);
451 }
452 }
453
454 // Load from the stream.
455 for(uint32_t lane = 0; lane < mVWidth; ++lane)
456 {
457 // Get index
458 Value* index = VEXTRACT(vCurIndices, C(lane));
459
460 if (fetchState.bPartialVertexBuffer) {
461 // clamp below minvertex
462 Value *isBelowMin = ICMP_SLT(index, minVertex);
463 index = SELECT(isBelowMin, minVertex, index);
464 }
465
466 index = Z_EXT(index, mInt64Ty);
467
468 Value* offset = MUL(index, stride);
469 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
470 offset = ADD(offset, startVertexOffset);
471
472 if (!fetchState.bDisableIndexOOBCheck) {
472 // check for out-of-bounds access, including partial OOB; OOB offsets are replaced with minVertexOffset (or 0 if there is no partial vertex buffer)
474 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
475 Value *oob = ICMP_ULE(endOffset, size);
476 if (fetchState.bPartialVertexBuffer) {
477 offset = SELECT(oob, offset, minVertexOffset);
478 } else {
479 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
480 }
481 }
482
483 Value* pointer = GEP(stream, offset);
484 // We use a full-lane, but don't actually care.
485 Value* vptr = 0;
486
487 // get a pointer to a 4 component attrib in default address space
488 switch(bpc)
489 {
490 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
491 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
492 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
493 default: SWR_INVALID("Unsupported underlying bpp!");
494 }
495
496 // load 4 components of attribute
497 Value* vec = ALIGNED_LOAD(vptr, 1, false);
498
499 // Convert To FP32 internally
500 switch(info.type[0])
501 {
502 case SWR_TYPE_UNORM:
503 switch(bpc)
504 {
505 case 8:
506 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
507 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
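// e.g. a raw 8-bit value of 255 maps to 1.0f, 128 maps to ~0.502f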
508 break;
509 case 16:
510 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
511 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
512 break;
513 default:
514 SWR_INVALID("Unsupported underlying type!");
515 break;
516 }
517 break;
518 case SWR_TYPE_SNORM:
519 switch(bpc)
520 {
521 case 8:
522 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
523 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
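// note: this path scales by 1/128, so -128 maps to -1.0f and 127 maps to ~0.992f
// (the gather path in ConvertFormat uses 1/(2^(n-1) - 1) instead)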
524 break;
525 case 16:
526 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
527 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
528 break;
529 default:
530 SWR_INVALID("Unsupported underlying type!");
531 break;
532 }
533 break;
534 case SWR_TYPE_UINT:
535 // Zero extend 8- and 16-bit UINT types to 32 bits.
536 switch(bpc)
537 {
538 case 8:
539 case 16:
540 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
541 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
542 break;
543 case 32:
544 break; // Pass through unchanged.
545 default:
546 SWR_INVALID("Unsupported underlying type!");
547 break;
548 }
549 break;
550 case SWR_TYPE_SINT:
551 // Sign extend SINT types.
552 switch(bpc)
553 {
554 case 8:
555 case 16:
556 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
557 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
558 break;
559 case 32:
560 break; // Pass through unchanged.
561 default:
562 SWR_INVALID("Unsupported underlying type!");
563 break;
564 }
565 break;
566 case SWR_TYPE_FLOAT:
567 switch(bpc)
568 {
569 case 32:
570 break; // Pass through unchanged.
571 default:
572 SWR_INVALID("Unsupported underlying type!");
573 }
574 break;
575 case SWR_TYPE_USCALED:
576 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
577 break;
578 case SWR_TYPE_SSCALED:
579 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
580 break;
581 case SWR_TYPE_SFIXED:
582 vec = FMUL(SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)), VBROADCAST(C(1/65536.0f)));
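// SFIXED is 16.16 fixed point; e.g. a raw value of 98304 (0x00018000) becomes 1.5f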
583 break;
584 case SWR_TYPE_UNKNOWN:
585 case SWR_TYPE_UNUSED:
586 SWR_INVALID("Unsupported type %d!", info.type[0]);
587 }
588
589 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
590 // uwvec: 4 x F32, undef value
591 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
592 vectors.push_back(wvec);
593 }
594
595 std::vector<Constant*> v01Mask(mVWidth);
596 std::vector<Constant*> v23Mask(mVWidth);
597 std::vector<Constant*> v02Mask(mVWidth);
598 std::vector<Constant*> v13Mask(mVWidth);
599
600 // Concatenate the vectors together.
601 elements[0] = VUNDEF_F();
602 elements[1] = VUNDEF_F();
603 elements[2] = VUNDEF_F();
604 elements[3] = VUNDEF_F();
605 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
606 {
607 v01Mask[4 * b + 0] = C(0 + 4 * b);
608 v01Mask[4 * b + 1] = C(1 + 4 * b);
609 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
610 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
611
612 v23Mask[4 * b + 0] = C(2 + 4 * b);
613 v23Mask[4 * b + 1] = C(3 + 4 * b);
614 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
615 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
616
617 v02Mask[4 * b + 0] = C(0 + 4 * b);
618 v02Mask[4 * b + 1] = C(2 + 4 * b);
619 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
620 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
621
622 v13Mask[4 * b + 0] = C(1 + 4 * b);
623 v13Mask[4 * b + 1] = C(3 + 4 * b);
624 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
625 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
626
627 std::vector<Constant*> iMask(mVWidth);
628 for(uint32_t i = 0; i < mVWidth; ++i)
629 {
630 if(((4 * b) <= i) && (i < (4 * (b + 1))))
631 {
632 iMask[i] = C(i % 4 + mVWidth);
633 }
634 else
635 {
636 iMask[i] = C(i);
637 }
638 }
639 Constant* insertMask = ConstantVector::get(iMask);
640 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
641 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
642 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
643 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
644 }
645
646 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
647 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
648 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
649 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
650 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
651 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
652 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
653 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
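// after the shuffle network above, elements[0..3] hold the x, y, z and w components
// (respectively) of all mVWidth vertices, i.e. the attribute is now in SOA form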
654
655 switch(numComponents + 1)
656 {
657 case 1: elements[0] = VIMMED1(0.0f);
658 case 2: elements[1] = VIMMED1(0.0f);
659 case 3: elements[2] = VIMMED1(0.0f);
660 case 4: elements[3] = VIMMED1(1.0f);
661 }
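// the fall-through above fills components the format does not supply with defaults,
// e.g. a 2-component format gets z = 0.0f and w = 1.0f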
662
663 for(uint32_t c = 0; c < 4; ++c)
664 {
665 #if USE_SIMD16_SHADERS
666 Value* dest = GEP(pVtxOut, C(nelt * 8 + c * 2), "destGEP");
667 #else
668 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
669 #endif
670 STORE(elements[c], dest);
671 }
672 }
673 }
674
675 // returns true for odd formats that require special gather handling
676 bool FetchJit::IsOddFormat(SWR_FORMAT format)
677 {
678 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
679 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
680 {
681 return true;
682 }
683 return false;
684 }
685
686 // format is uniform if all components are the same size and type
687 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
688 {
689 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
690 uint32_t bpc0 = info.bpc[0];
691 uint32_t type0 = info.type[0];
692
693 for (uint32_t c = 1; c < info.numComps; ++c)
694 {
695 if (bpc0 != info.bpc[c] || type0 != info.type[c])
696 {
697 return false;
698 }
699 }
700 return true;
701 }
702
703 // unpacks components based on format
704 // foreach component in the pixel
705 // mask off everything but this component
706 // shift component to LSB
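// e.g. for a 10-10-10-2 packed layout, component 1 is extracted as
//   comp = (vInput & 0x000FFC00) >> 10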
707 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
708 {
709 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
710
711 uint32_t bitOffset = 0;
712 for (uint32_t c = 0; c < info.numComps; ++c)
713 {
714 uint32_t swizzledIndex = info.swizzle[c];
715 uint32_t compBits = info.bpc[c];
716 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
717 Value* comp = AND(vInput, bitmask);
718 comp = LSHR(comp, bitOffset);
719
720 result[swizzledIndex] = comp;
721 bitOffset += compBits;
722 }
723 }
724
725 // gather for odd component size formats
726 // gather full SIMD pixels per lane, then shift/mask to move each component into
727 // its own vector
728 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
729 {
730 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
731
732 // only works if pixel size is <= 32bits
733 SWR_ASSERT(info.bpp <= 32);
734
735 Value *pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
736
737 for (uint32_t comp = 0; comp < 4; ++comp)
738 {
739 pResult[comp] = VIMMED1((int)info.defaults[comp]);
740 }
741
742 UnpackComponents(format, pGather, pResult);
743
744 // cast to fp32
745 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
746 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
747 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
748 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
749 }
750
751 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
752 {
753 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
754
755 for (uint32_t c = 0; c < info.numComps; ++c)
756 {
757 uint32_t compIndex = info.swizzle[c];
758
759 // skip any conversion on UNUSED components
760 if (info.type[c] == SWR_TYPE_UNUSED)
761 {
762 continue;
763 }
764
765 if (info.isNormalized[c])
766 {
767 if (info.type[c] == SWR_TYPE_SNORM)
768 {
769 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
770
771 /// result = c * (1.0f / (2^(n-1) - 1))
772 uint32_t n = info.bpc[c];
773 uint32_t pow2 = 1 << (n - 1);
774 float scale = 1.0f / (float)(pow2 - 1);
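// e.g. for an 8-bit SNORM component: n = 8, scale = 1.0f / 127.0f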
775 Value *vScale = VIMMED1(scale);
776 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
777 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
778 texels[compIndex] = FMUL(texels[compIndex], vScale);
779 }
780 else
781 {
782 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
783
784 /// result = c * (1.0f / (2^n - 1))
785 uint32_t n = info.bpc[c];
786 uint32_t pow2 = 1 << n;
787 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
788 if (n == 24)
789 {
790 float scale = (float)(pow2 - 1);
791 Value* vScale = VIMMED1(scale);
792 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
793 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
794 texels[compIndex] = FDIV(texels[compIndex], vScale);
795 }
796 else
797 {
798 float scale = 1.0f / (float)(pow2 - 1);
799 Value *vScale = VIMMED1(scale);
800 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
801 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
802 texels[compIndex] = FMUL(texels[compIndex], vScale);
803 }
804 }
805 continue;
806 }
807 }
808 }
809
810 //////////////////////////////////////////////////////////////////////////
811 /// @brief Loads attributes from memory using AVX2 GATHER(s)
812 /// @param fetchState - info about attributes to be fetched from memory
813 /// @param streams - value pointer to the current vertex stream
814 /// @param vIndices - vector value of indices to gather
815 /// @param pVtxOut - value pointer to output simdvertex struct
816 #if USE_SIMD16_SHADERS
817 #if USE_SIMD16_GATHERS
818 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
819 Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2)
820 #else
821 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
822 Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2)
823 #endif
824 #else
825 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
826 Value* streams, Value* vIndices, Value* pVtxOut)
827 #endif
828 {
829 uint32_t currentVertexElement = 0;
830 uint32_t outputElt = 0;
831 Value* vVertexElements[4];
832 #if USE_SIMD16_GATHERS
833 Value* vVertexElements2[4];
834 #if USE_SIMD16_BUILDER
835 Value *pVtxSrc2[4];
836 #endif
837 #endif
838
839 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
840 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
841 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
842 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
843 curInstance->setName("curInstance");
844
845 for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
846 {
847 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
848
849 // skip element if all components are disabled
850 if (ied.ComponentPacking == ComponentEnable::NONE)
851 {
852 continue;
853 }
854
855 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
856 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
857 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
858
859 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
860
861 // VGATHER* takes an *i8 src pointer
862 Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
863
864 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
865 Value *vStride = VBROADCAST(stride);
866
867 // max vertex index that is fully in bounds
868 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
869 maxVertex = LOAD(maxVertex);
870
871 Value *minVertex = NULL;
872 if (fetchState.bPartialVertexBuffer)
873 {
874 // min vertex index for low bounds OOB checking
875 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
876 minVertex = LOAD(minVertex);
877 }
878
879 if (fetchState.bInstanceIDOffsetEnable)
880 {
881 // the InstanceID (curInstance) value is offset by StartInstanceLocation
882 curInstance = ADD(curInstance, startInstance);
883 }
884
885 Value *vCurIndices;
886 #if USE_SIMD16_GATHERS
887 Value *vCurIndices2;
888 #endif
889 Value *startOffset;
890 Value *vInstanceStride = VIMMED1(0);
891
892 if (ied.InstanceEnable)
893 {
894 Value* stepRate = C(ied.InstanceAdvancementState);
895
896 // prevent a div by 0 for 0 step rate
897 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
898 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
899
900 // calc the current offset into instanced data buffer
901 Value* calcInstance = UDIV(curInstance, stepRate);
902
903 // if step rate is 0, every instance gets instance 0
904 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
905
906 vCurIndices = VBROADCAST(calcInstance);
907 #if USE_SIMD16_GATHERS
908 vCurIndices2 = VBROADCAST(calcInstance);
909 #endif
910
911 startOffset = startInstance;
912 }
913 else if (ied.InstanceStrideEnable)
914 {
915 // grab the instance advancement state, determines stride in bytes from one instance to the next
916 Value* stepRate = C(ied.InstanceAdvancementState);
917 vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
918
919 // offset indices by baseVertex
920 vCurIndices = ADD(vIndices, vBaseVertex);
921 #if USE_SIMD16_GATHERS
922 vCurIndices2 = ADD(vIndices2, vBaseVertex);
923 #endif
924
925 startOffset = startVertex;
926 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
927 }
928 else
929 {
930 // offset indices by baseVertex
931 vCurIndices = ADD(vIndices, vBaseVertex);
932 #if USE_SIMD16_GATHERS
933 vCurIndices2 = ADD(vIndices2, vBaseVertex);
934 #endif
935
936 startOffset = startVertex;
937 }
938
939 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
940 // do 64bit address offset calculations.
941
942 // calculate byte offset to the start of the VB
943 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
944 pStreamBase = GEP(pStreamBase, baseOffset);
945
946 // if we have a start offset, subtract from max vertex. Used for OOB check
947 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
948 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
949 // if we have a negative value, we're already OOB. clamp at 0.
950 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
951
952 if (fetchState.bPartialVertexBuffer)
953 {
954 // similarly for min vertex
955 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
956 Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
957 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
958 }
959
960 // Load the in bounds size of a partially valid vertex
961 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
962 partialInboundsSize = LOAD(partialInboundsSize);
963 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
964 Value* vBpp = VBROADCAST(C(info.Bpp));
965 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
966
967 // is the element <= the partially valid size
968 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
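// e.g. with partialInboundsSize = 12, a 4-byte element at AlignedByteOffset 8 is still
// in bounds (4 <= 12 - 8), while an 8-byte element at the same offset is not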
969
970 #if USE_SIMD16_GATHERS
971 // override cur indices with 0 if pitch is 0
972 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
973 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
974 vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2);
975
976 // are vertices partially OOB?
977 Value* vMaxVertex = VBROADCAST(maxVertex);
978 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
979 Value* vPartialOOBMask2 = ICMP_EQ(vCurIndices2, vMaxVertex);
980
981 // are vertices fully in bounds?
982 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
983 Value* vMaxGatherMask2 = ICMP_ULT(vCurIndices2, vMaxVertex);
984
985 Value *vGatherMask;
986 Value *vGatherMask2;
987 if (fetchState.bPartialVertexBuffer)
988 {
989 // are vertices below minVertex limit?
990 Value *vMinVertex = VBROADCAST(minVertex);
991 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
992 Value *vMinGatherMask2 = ICMP_UGE(vCurIndices2, vMinVertex);
993
994 // only fetch lanes that pass both tests
995 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
996 vGatherMask2 = AND(vMaxGatherMask2, vMinGatherMask2);
997 }
998 else
999 {
1000 vGatherMask = vMaxGatherMask;
1001 vGatherMask2 = vMaxGatherMask2;
1002 }
1003
1004 // blend in any partially OOB indices that have valid elements
1005 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
1006 vGatherMask2 = SELECT(vPartialOOBMask2, vElementInBoundsMask, vGatherMask2);
1007 Value *pMask = vGatherMask;
1008 Value *pMask2 = vGatherMask2;
1009 vGatherMask = VMASK(vGatherMask);
1010 vGatherMask2 = VMASK(vGatherMask2);
1011
1012 // calculate the actual offsets into the VB
1013 Value* vOffsets = MUL(vCurIndices, vStride);
1014 vOffsets = ADD(vOffsets, vAlignmentOffsets);
1015
1016 Value* vOffsets2 = MUL(vCurIndices2, vStride);
1017 vOffsets2 = ADD(vOffsets2, vAlignmentOffsets);
1018
1019 // if instance stride enable is:
1020 //  true  - add product of the instanceID and advancement state to the offset into the VB
1021 //  false - value of vInstanceStride has been initialized to zero
1022 vOffsets = ADD(vOffsets, vInstanceStride);
1023 vOffsets2 = ADD(vOffsets2, vInstanceStride);
1024
1025 #else
1026 // override cur indices with 0 if pitch is 0
1027 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
1028 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
1029
1030 // are vertices partially OOB?
1031 Value* vMaxVertex = VBROADCAST(maxVertex);
1032 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
1033
1034 // are vertices fully in bounds?
1035 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
1036
1037 Value *vGatherMask;
1038 if (fetchState.bPartialVertexBuffer)
1039 {
1040 // are vertices below minVertex limit?
1041 Value *vMinVertex = VBROADCAST(minVertex);
1042 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
1043
1044 // only fetch lanes that pass both tests
1045 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
1046 }
1047 else
1048 {
1049 vGatherMask = vMaxGatherMask;
1050 }
1051
1052 // blend in any partially OOB indices that have valid elements
1053 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
1054 Value* pMask = vGatherMask;
1055 vGatherMask = VMASK(vGatherMask);
1056
1057 // calculate the actual offsets into the VB
1058 Value* vOffsets = MUL(vCurIndices, vStride);
1059 vOffsets = ADD(vOffsets, vAlignmentOffsets);
1060
1061 // if instance stride enable is:
1062 //  true  - add product of the instanceID and advancement state to the offset into the VB
1063 //  false - value of vInstanceStride has been initialized to zero
1064 vOffsets = ADD(vOffsets, vInstanceStride);
1065
1066 #endif
1067 // Packing and component control
1068 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
1069 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
1070 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
1071
1072 // Special gather/conversion for formats without equal component sizes
1073 if (IsOddFormat((SWR_FORMAT)ied.Format))
1074 {
1075 #if USE_SIMD16_GATHERS
1076 Value *pResults[4];
1077 Value *pResults2[4];
1078 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1079 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask2, pStreamBase, vOffsets2, pResults2);
1080 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1081 ConvertFormat((SWR_FORMAT)ied.Format, pResults2);
1082
1083 for (uint32_t c = 0; c < 4; c += 1)
1084 {
1085 if (isComponentEnabled(compMask, c))
1086 {
1087 #if USE_SIMD16_BUILDER
1088 // pack adjacent pairs of SIMD8s into SIMD16s
1089 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1090 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pResults[c], 0);
1091 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pResults2[c], 1);
1092
1093 #else
1094 vVertexElements[currentVertexElement] = pResults[c];
1095 vVertexElements2[currentVertexElement] = pResults2[c];
1096
1097 #endif
1098 currentVertexElement += 1;
1099
1100 if (currentVertexElement > 3)
1101 {
1102 #if USE_SIMD16_BUILDER
1103 // store SIMD16s
1104 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1105
1106 StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
1107
1108 #else
1109 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1110 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1111
1112 #endif
1113 outputElt += 1;
1114
1115 // reset to the next vVertexElement to output
1116 currentVertexElement = 0;
1117 }
1118 }
1119 }
1120 #else
1121 Value* pResults[4];
1122 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1123 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1124
1125 for (uint32_t c = 0; c < 4; ++c)
1126 {
1127 if (isComponentEnabled(compMask, c))
1128 {
1129 vVertexElements[currentVertexElement++] = pResults[c];
1130 if (currentVertexElement > 3)
1131 {
1132 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1133 // reset to the next vVertexElement to output
1134 currentVertexElement = 0;
1135 }
1136 }
1137 }
1138 #endif
1139 }
1140 else if(info.type[0] == SWR_TYPE_FLOAT)
1141 {
1142 ///@todo: support 64 bit vb accesses
1143 Value *gatherSrc = VIMMED1(0.0f);
1144 #if USE_SIMD16_GATHERS
1145 Value *gatherSrc2 = VIMMED1(0.0f);
1146 #if USE_SIMD16_BUILDER
1147 Value *gatherSrc16 = VIMMED2_1(0.0f);
1148 #endif
1149 #endif
1150
1151 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1152 "Unsupported format for standard gather fetch.");
1153
1154 // Gather components from memory to store in a simdvertex structure
1155 switch (bpc)
1156 {
1157 case 16:
1158 {
1159 #if USE_SIMD16_GATHERS
1160 Value *vGatherResult[2];
1161 Value *vGatherResult2[2];
1162
1163 // if we have at least one component out of x or y to fetch
1164 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1165 {
1166 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1167 vGatherResult2[0] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1168 // e.g. result of first 8x32bit integer gather for 16bit components
1169 // 256i - 0 1 2 3 4 5 6 7
1170 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1171 //
1172 }
1173 else
1174 {
1175 vGatherResult[0] = VUNDEF_I();
1176 vGatherResult2[0] = VUNDEF_I();
1177 }
1178
1179 // if we have at least one component out of z or w to fetch
1180 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1181 {
1182 // offset base to the next components(zw) in the vertex to gather
1183 pStreamBase = GEP(pStreamBase, C((char)4));
1184
1185 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1186 vGatherResult2[1] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1187 // e.g. result of second 8x32bit integer gather for 16bit components
1188 // 256i - 0 1 2 3 4 5 6 7
1189 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1190 //
1191 }
1192 else
1193 {
1194 vGatherResult[1] = VUNDEF_I();
1195 vGatherResult2[1] = VUNDEF_I();
1196 }
1197
1198 // if we have at least one component to shuffle into place
1199 if (compMask)
1200 {
1201 #if USE_SIMD16_BUILDER
1202 Value *gatherResult[2];
1203
1204 gatherResult[0] = VUNDEF2_I();
1205 gatherResult[1] = VUNDEF2_I();
1206
1207 gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult[0], 0);
1208 gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult2[0], 1);
1209
1210 gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult[1], 0);
1211 gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult2[1], 1);
1212
1213 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1214
1215 Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE,
1216 currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
1217
1218 // Shuffle gathered components into place in simdvertex struct
1219 Shuffle16bpcGather2(args); // outputs to vVertexElements ref
1220 #else
1221 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
1222 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1223 Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), Instruction::CastOps::FPExt, CONVERT_NONE,
1224 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
1225
1226 // Shuffle gathered components into place in simdvertex struct
1227 Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
1228 Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref
1229 #endif
1230 }
1231 #else
1232 Value* vGatherResult[2];
1233
1234 // if we have at least one component out of x or y to fetch
1235 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1236 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1237 // e.g. result of first 8x32bit integer gather for 16bit components
1238 // 256i - 0 1 2 3 4 5 6 7
1239 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1240 //
1241 }
1242
1243 // if we have at least one component out of z or w to fetch
1244 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1245 // offset base to the next components(zw) in the vertex to gather
1246 pStreamBase = GEP(pStreamBase, C((char)4));
1247
1248 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1249 // e.g. result of second 8x32bit integer gather for 16bit components
1250 // 256i - 0 1 2 3 4 5 6 7
1251 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1252 //
1253 }
1254
1255 // if we have at least one component to shuffle into place
1256 if(compMask){
1257 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
1258 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1259
1260 // Shuffle gathered components into place in simdvertex struct
1261 #if USE_SIMD16_SHADERS
1262 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1263 #else
1264 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1265 #endif
1266 }
1267 #endif
1268 }
1269 break;
1270 case 32:
1271 {
1272 for (uint32_t i = 0; i < 4; i += 1)
1273 {
1274 #if USE_SIMD16_GATHERS
1275 if (isComponentEnabled(compMask, i))
1276 {
1277 // if we need to gather the component
1278 if (compCtrl[i] == StoreSrc)
1279 {
1280 // Gather a SIMD of vertices
1281 // APIs allow a 4GB range for offsets
1282 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1283 // But, we know that elements must be aligned for FETCH. :)
1284 // Right shift the offset by one bit and gather with a scale of 2, keeping the offset within the positive signed range.
1285 Value *vShiftedOffsets = VPSRLI(vOffsets, C(1));
1286 Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1));
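// e.g. a byte offset of 0x80000004 would read as negative in a signed i32, but
// 0x80000004 >> 1 = 0x40000002 and the gather scale of 2 restores the original address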
1287 #if USE_SIMD16_BUILDER
1288 Value *indices = VUNDEF2_I();
1289 indices = INSERT2_I(indices, vShiftedOffsets, 0);
1290 indices = INSERT2_I(indices, vShiftedOffsets2, 1);
1291
1292 Value *mask = VUNDEF2_I();
1293 mask = INSERT2_I(mask, vGatherMask, 0);
1294 mask = INSERT2_I(mask, vGatherMask2, 1);
1295
1296 pVtxSrc2[currentVertexElement] = GATHERPS2(gatherSrc16, pStreamBase, indices, mask, 2);
1297 #else
1298 vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
1299 vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vGatherMask2, 2);
1300
1301 #if USE_SIMD16_BUILDER
1302 // pack adjacent pairs of SIMD8s into SIMD16s
1303 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1304 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements[currentVertexElement], 0);
1305 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements2[currentVertexElement], 1);
1306
1307 #endif
1308 #endif
1309 currentVertexElement += 1;
1310 }
1311 else
1312 {
1313 #if USE_SIMD16_BUILDER
1314 pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
1315 #else
1316 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1317 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1318
1319 #if USE_SIMD16_BUILDER
1320 // pack adjacent pairs of SIMD8s into SIMD16s
1321 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1322 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements[currentVertexElement], 0);
1323 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements2[currentVertexElement], 1);
1324
1325 #endif
1326 #endif
1327 currentVertexElement += 1;
1328 }
1329
1330 if (currentVertexElement > 3)
1331 {
1332 #if USE_SIMD16_BUILDER
1333 // store SIMD16s
1334 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1335
1336 StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
1337
1338 #else
1339 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1340 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1341
1342 #endif
1343 outputElt += 1;
1344
1345 // reset to the next vVertexElement to output
1346 currentVertexElement = 0;
1347 }
1348 }
1349
1350 // offset base to the next component in the vertex to gather
1351 pStreamBase = GEP(pStreamBase, C((char)4));
1352 #else
1353 if (isComponentEnabled(compMask, i))
1354 {
1355 // if we need to gather the component
1356 if (compCtrl[i] == StoreSrc)
1357 {
1358 // Gather a SIMD of vertices
1359 // APIs allow a 4GB range for offsets
1360 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1361 // But, we know that elements must be aligned for FETCH. :)
1362 // Right shift the offset by one bit and gather with a scale of 2, keeping the offset within the positive signed range.
1363 Value* vShiftedOffsets = VPSRLI(vOffsets, C(1));
1364 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
1365 }
1366 else
1367 {
1368 #if USE_SIMD16_SHADERS
1369 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1370 #else
1371 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1372 #endif
1373 }
1374
1375 if (currentVertexElement > 3)
1376 {
1377 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1378 // reset to the next vVertexElement to output
1379 currentVertexElement = 0;
1380 }
1381 }
1382
1383 // offset base to the next component in the vertex to gather
1384 pStreamBase = GEP(pStreamBase, C((char)4));
1385 #endif
1386 }
1387 }
1388 break;
1389 case 64:
1390 {
1391 for (uint32_t i = 0; i < 4; i += 1)
1392 {
1393 #if USE_SIMD16_GATHERS
1394 if (isComponentEnabled(compMask, i))
1395 {
1396 // if we need to gather the component
1397 if (compCtrl[i] == StoreSrc)
1398 {
1399 Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1400 Value *vMaskLo2 = VSHUFFLE(pMask2, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1401 Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1402 Value *vMaskHi2 = VSHUFFLE(pMask2, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1403 vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4));
1404 vMaskLo2 = S_EXT(vMaskLo2, VectorType::get(mInt64Ty, 4));
1405 vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4));
1406 vMaskHi2 = S_EXT(vMaskHi2, VectorType::get(mInt64Ty, 4));
1407 vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4));
1408 vMaskLo2 = BITCAST(vMaskLo2, VectorType::get(mDoubleTy, 4));
1409 vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4));
1410 vMaskHi2 = BITCAST(vMaskHi2, VectorType::get(mDoubleTy, 4));
1411
1412 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1413 Value *vOffsetsLo2 = VEXTRACTI128(vOffsets2, C(0));
1414 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1415 Value *vOffsetsHi2 = VEXTRACTI128(vOffsets2, C(1));
1416
1417 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1418
1419 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
1420 Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2);
1421 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
1422 Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2);
1423
1424 pGatherLo = VCVTPD2PS(pGatherLo);
1425 pGatherLo2 = VCVTPD2PS(pGatherLo2);
1426 pGatherHi = VCVTPD2PS(pGatherHi);
1427 pGatherHi2 = VCVTPD2PS(pGatherHi2);
1428
1429 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1430 Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1431
1432 #if USE_SIMD16_BUILDER
1433 // pack adjacent pairs of SIMD8s into SIMD16s
1434 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1435 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather, 0);
1436 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather2, 1);
1437
1438 #else
1439 vVertexElements[currentVertexElement] = pGather;
1440 vVertexElements2[currentVertexElement] = pGather2;
1441
1442 #endif
1443 currentVertexElement += 1;
1444 }
1445 else
1446 {
1447 #if USE_SIMD16_BUILDER
1448 pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
1449
1450 #else
1451 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1452 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1453
1454 #endif
1455 currentVertexElement += 1;
1456 }
1457
1458 if (currentVertexElement > 3)
1459 {
1460 #if USE_SIMD16_BUILDER
1461 // store SIMD16s
1462 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1463
1464 StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
1465
1466 #else
1467 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1468 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1469
1470 #endif
1471 outputElt += 1;
1472
1473 // reset to the next vVertexElement to output
1474 currentVertexElement = 0;
1475 }
1476 }
1477
1478 // offset base to the next component in the vertex to gather
1479 pStreamBase = GEP(pStreamBase, C((char)8));
1480 #else
1481 if (isComponentEnabled(compMask, i))
1482 {
1483 // if we need to gather the component
1484 if (compCtrl[i] == StoreSrc)
1485 {
1486 Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
1487 Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
1488 vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4));
1489 vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4));
1490 vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4));
1491 vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4));
1492
1493 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1494 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1495
1496 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1497
1498 Value* pGatherLo = GATHERPD(vZeroDouble,
1499 pStreamBase, vOffsetsLo, vMaskLo);
1500 Value* pGatherHi = GATHERPD(vZeroDouble,
1501 pStreamBase, vOffsetsHi, vMaskHi);
1502
1503 pGatherLo = VCVTPD2PS(pGatherLo);
1504 pGatherHi = VCVTPD2PS(pGatherHi);
1505
1506 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
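// pGather now holds the 8 double-precision components for this channel, fetched as
// two 4-wide gathers (lanes 0..3 and 4..7) and converted down to single precision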
1507
1508 vVertexElements[currentVertexElement++] = pGather;
1509 }
1510 else
1511 {
1512 #if USE_SIMD16_SHADERS
1513 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1514 #else
1515 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1516 #endif
1517 }
1518
1519 if (currentVertexElement > 3)
1520 {
1521 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1522 // reset to the next vVertexElement to output
1523 currentVertexElement = 0;
1524 }
1525 }
1526
1527 // offset base to the next component in the vertex to gather
1528 pStreamBase = GEP(pStreamBase, C((char)8));
1529 #endif
1530 }
1531 }
1532 break;
1533 default:
1534 SWR_INVALID("Tried to fetch invalid FP format");
1535 break;
1536 }
1537 }
1538 else
1539 {
1540 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
1541 ConversionType conversionType = CONVERT_NONE;
1542
1543 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1544 "Unsupported format for standard gather fetch.");
1545
1546 switch(info.type[0])
1547 {
1548 case SWR_TYPE_UNORM:
1549 conversionType = CONVERT_NORMALIZED;
1550 case SWR_TYPE_UINT:
1551 extendCastType = Instruction::CastOps::ZExt;
1552 break;
1553 case SWR_TYPE_SNORM:
1554 conversionType = CONVERT_NORMALIZED;
1555 case SWR_TYPE_SINT:
1556 extendCastType = Instruction::CastOps::SExt;
1557 break;
1558 case SWR_TYPE_USCALED:
1559 conversionType = CONVERT_USCALED;
1560 extendCastType = Instruction::CastOps::UIToFP;
1561 break;
1562 case SWR_TYPE_SSCALED:
1563 conversionType = CONVERT_SSCALED;
1564 extendCastType = Instruction::CastOps::SIToFP;
1565 break;
1566 case SWR_TYPE_SFIXED:
1567 conversionType = CONVERT_SFIXED;
1568 extendCastType = Instruction::CastOps::SExt;
1569 break;
1570 default:
1571 break;
1572 }
1573
1574 // value substituted when component of gather is masked
1575 Value* gatherSrc = VIMMED1(0);
1576 #if USE_SIMD16_GATHERS
1577 Value* gatherSrc2 = VIMMED1(0);
1578 #endif
1579
1580 // Gather components from memory to store in a simdvertex structure
1581 switch (bpc)
1582 {
1583 case 8:
1584 {
1585 // if we have at least one component to fetch
1586 if (compMask)
1587 {
1588 #if USE_SIMD16_GATHERS
1589 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1590 Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1591
1592 // e.g. result of an 8x32bit integer gather for 8bit components
1593 // 256i - 0 1 2 3 4 5 6 7
1594 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1595
1596 #if USE_SIMD16_BUILDER
1597 Value *gatherResult = VUNDEF2_I();
1598
1599 gatherResult = INSERT2_I(gatherResult, vGatherResult, 0);
1600 gatherResult = INSERT2_I(gatherResult, vGatherResult2, 1);
1601
1602 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1603
1604 Shuffle8bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
1605 currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2, info.swizzle);
1606
1607 // Shuffle gathered components into place in simdvertex struct
1608 Shuffle8bpcGatherd2(args); // outputs to vVertexElements ref
1609 #else
1610 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1611 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1612 Shuffle8bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
1613 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2, info.swizzle);
1614
1615 // Shuffle gathered components into place in simdvertex struct
1616 Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref
1617 Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements ref
1618 #endif
1619 #else
1620 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1621 // e.g. result of an 8x32bit integer gather for 8bit components
1622 // 256i - 0 1 2 3 4 5 6 7
1623 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1624
1625 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1626 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1627
1628 // Shuffle gathered components into place in simdvertex struct
1629 #if USE_SIMD16_SHADERS
1630 Shuffle8bpcGatherd(args, useVertexID2); // outputs to vVertexElements ref
1631 #else
1632 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
1633 #endif
1634 #endif
1635 }
1636 }
1637 break;
1638 case 16:
1639 {
1640 #if USE_SIMD16_GATHERS
1641 Value* vGatherResult[2];
1642 Value* vGatherResult2[2];
1643
1644 // if we have at least one component out of x or y to fetch
1645 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1646 {
1647 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1648 vGatherResult2[0] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1649 // e.g. result of first 8x32bit integer gather for 16bit components
1650 // 256i - 0 1 2 3 4 5 6 7
1651 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1652 //
1653 }
1654 else
1655 {
1656 vGatherResult[0] = VUNDEF_I();
1657 vGatherResult2[0] = VUNDEF_I();
1658 }
1659
1660 // if we have at least one component out of z or w to fetch
1661 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1662 {
1663 // offset base to the next components(zw) in the vertex to gather
1664 pStreamBase = GEP(pStreamBase, C((char)4));
1665
1666 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1667 vGatherResult2[1] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1668 // e.g. result of second 8x32bit integer gather for 16bit components
1669 // 256i - 0 1 2 3 4 5 6 7
1670 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1671 //
1672 }
1673 else
1674 {
1675 vGatherResult[1] = VUNDEF_I();
1676 vGatherResult2[1] = VUNDEF_I();
1677 }
1678
1679 // if we have at least one component to shuffle into place
1680 if (compMask)
1681 {
1682 #if USE_SIMD16_BUILDER
1683 Value *gatherResult[2];
1684
1685 gatherResult[0] = VUNDEF2_I();
1686 gatherResult[1] = VUNDEF2_I();
1687
1688 gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult[0], 0);
1689 gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult2[0], 1);
1690
1691 gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult[1], 0);
1692 gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult2[1], 1);
1693
1694 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1695
1696 Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
1697 currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
1698
1699 // Shuffle gathered components into place in simdvertex struct
1700 Shuffle16bpcGather2(args); // outputs to vVertexElements ref
1701 #else
1702 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1703 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1704 Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
1705 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
1706
1707 // Shuffle gathered components into place in simdvertex struct
1708 Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
1709 Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref
1710 #endif
1711 }
1712 #else
1713 Value* vGatherResult[2];
1714
1715 // if we have at least one component out of x or y to fetch
1716 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1717 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1718 // e.g. result of first 8x32bit integer gather for 16bit components
1719 // 256i - 0 1 2 3 4 5 6 7
1720 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1721 //
1722 }
1723
1724 // if we have at least one component out of z or w to fetch
1725 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1726 // offset base to the next components(zw) in the vertex to gather
1727 pStreamBase = GEP(pStreamBase, C((char)4));
1728
1729 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1730 // e.g. result of second 8x32bit integer gather for 16bit components
1731 // 256i - 0 1 2 3 4 5 6 7
1732 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1733 //
1734 }
1735
1736 // if we have at least one component to shuffle into place
1737 if(compMask){
1738 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1739 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1740
1741 // Shuffle gathered components into place in simdvertex struct
1742 #if USE_SIMD16_SHADERS
1743 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1744 #else
1745 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1746 #endif
1747 }
1748 #endif
1749 }
1750 break;
1751 case 32:
1752 {
1753                 // Gather each enabled 32bit component into place in the simdvertex struct
1754 for (uint32_t i = 0; i < 4; i++)
1755 {
1756 if (isComponentEnabled(compMask, i))
1757 {
1758 // if we need to gather the component
1759 if (compCtrl[i] == StoreSrc)
1760 {
1761 #if USE_SIMD16_GATHERS
1762 Value *pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1763 Value *pGather2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1764
1765 if (conversionType == CONVERT_USCALED)
1766 {
1767 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1768 pGather2 = UI_TO_FP(pGather2, mSimdFP32Ty);
1769 }
1770 else if (conversionType == CONVERT_SSCALED)
1771 {
1772 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1773 pGather2 = SI_TO_FP(pGather2, mSimdFP32Ty);
1774 }
1775 else if (conversionType == CONVERT_SFIXED)
1776 {
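                                    // SFIXED is 16.16 fixed point; multiplying by 1/65536 (2^-16)
                                    // after the int->float convert recovers the intended value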
1777 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
1778 pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
1779 }
1780
1781 #if USE_SIMD16_BUILDER
1782 // pack adjacent pairs of SIMD8s into SIMD16s
1783 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1784 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather, 0);
1785 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather2, 1);
1786
1787 #else
1788 vVertexElements[currentVertexElement] = pGather;
1789 vVertexElements2[currentVertexElement] = pGather2;
1790
1791 #endif
1792
1793 // e.g. result of a single 8x32bit integer gather for 32bit components
1794 // 256i - 0 1 2 3 4 5 6 7
1795 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1796
1797 currentVertexElement += 1;
1798 #else
1799 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1800
1801 if (conversionType == CONVERT_USCALED)
1802 {
1803 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1804 }
1805 else if (conversionType == CONVERT_SSCALED)
1806 {
1807 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1808 }
1809 else if (conversionType == CONVERT_SFIXED)
1810 {
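                            // SFIXED: 16.16 fixed point, so scale by 1/65536 (2^-16) after converting to float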
1811 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
1812 }
1813
1814 vVertexElements[currentVertexElement++] = pGather;
1815 // e.g. result of a single 8x32bit integer gather for 32bit components
1816 // 256i - 0 1 2 3 4 5 6 7
1817 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1818 #endif
1819 }
1820 else
1821 {
1822 #if USE_SIMD16_SHADERS
1823 #if USE_SIMD16_GATHERS
1824 #if USE_SIMD16_BUILDER
1825 pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
1826
1827 #else
1828 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1829 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1830
1831 #endif
1832 currentVertexElement += 1;
1833 #else
1834 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1835 #endif
1836 #else
1837 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1838 #endif
1839 }
1840
1841 if (currentVertexElement > 3)
1842 {
1843 #if USE_SIMD16_GATHERS
1844 #if USE_SIMD16_BUILDER
1845 // store SIMD16s
1846 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1847
1848 StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
1849
1850 #else
1851 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1852 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1853
1854 #endif
1855 outputElt += 1;
1856 #else
1857 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1858 #endif
1859
1860 // reset to the next vVertexElement to output
1861 currentVertexElement = 0;
1862 }
1863
1864 }
1865
1866 // offset base to the next component in the vertex to gather
1867 pStreamBase = GEP(pStreamBase, C((char)4));
1868 }
1869 }
1870 break;
1871 }
1872 }
1873 }
1874
1875 // if we have a partially filled vVertexElement struct, output it
1876 if (currentVertexElement > 0)
1877 {
1878 #if USE_SIMD16_GATHERS
1879 #if USE_SIMD16_BUILDER
1880 // store SIMD16s
1881 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1882
1883 StoreVertexElements2(pVtxOut2, outputElt, currentVertexElement, pVtxSrc2);
1884
1885 #else
1886 StoreVertexElements(pVtxOut, outputElt, currentVertexElement, vVertexElements);
1887 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, currentVertexElement, vVertexElements2);
1888
1889 #endif
1890 outputElt += 1;
1891 #else
1892 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1893 #endif
1894 }
1895 }
1896
1897 //////////////////////////////////////////////////////////////////////////
1898 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1899 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1900 /// support
1901 /// @param pIndices - pointer to 8 bit indices
1902 /// @param pLastIndex - pointer to last valid index
1903 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1904 {
1905     // can fit 4 8 bit integers per vWidth lane
1906 Value* vIndices = VUNDEF_I();
1907
1908 // store 0 index on stack to be used to conditionally load from if index address is OOB
1909 Value* pZeroIndex = ALLOCA(mInt8Ty);
1910 STORE(C((uint8_t)0), pZeroIndex);
1911
1912 // Load a SIMD of index pointers
1913 for(int64_t lane = 0; lane < mVWidth; lane++)
1914 {
1915 // Calculate the address of the requested index
1916 Value *pIndex = GEP(pIndices, C(lane));
1917
1918 // check if the address is less than the max index,
1919 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1920
1921 // if valid, load the index. if not, load 0 from the stack
1922 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1923 Value *index = LOAD(pValid, "valid index");
1924
1925         // zero extend the index to 32 bits and insert into the correct simd lane
1926 index = Z_EXT(index, mInt32Ty);
1927 vIndices = VINSERT(vIndices, index, lane);
1928 }
1929 return vIndices;
1930 }
1931
1932 //////////////////////////////////////////////////////////////////////////
1933 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1934 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1935 /// support
1936 /// @param pIndices - pointer to 16 bit indices
1937 /// @param pLastIndex - pointer to last valid index
1938 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1939 {
1940 // can fit 2 16 bit integers per vWidth lane
1941 Value* vIndices = VUNDEF_I();
1942
1943 // store 0 index on stack to be used to conditionally load from if index address is OOB
1944 Value* pZeroIndex = ALLOCA(mInt16Ty);
1945 STORE(C((uint16_t)0), pZeroIndex);
1946
1947 // Load a SIMD of index pointers
1948 for(int64_t lane = 0; lane < mVWidth; lane++)
1949 {
1950 // Calculate the address of the requested index
1951 Value *pIndex = GEP(pIndices, C(lane));
1952
1953 // check if the address is less than the max index,
1954 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1955
1956 // if valid, load the index. if not, load 0 from the stack
1957 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1958 Value *index = LOAD(pValid, "valid index");
1959
1960         // zero extend the index to 32 bits and insert into the correct simd lane
1961 index = Z_EXT(index, mInt32Ty);
1962 vIndices = VINSERT(vIndices, index, lane);
1963 }
1964 return vIndices;
1965 }
1966
1967 //////////////////////////////////////////////////////////////////////////
1968 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1969 /// @param pIndices - pointer to 32 bit indices
1970 /// @param pLastIndex - pointer to last valid index
1971 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1972 {
1973 DataLayout dL(JM()->mpCurrentModule);
1974 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1975 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1976 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1977
1978 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1979 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1980 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1981 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1982
1983 // create a vector of index counts from the base index ptr passed into the fetch
1984 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1985 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1986
1987 // compare index count to the max valid index
1988 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1989 // vIndexOffsets 0 1 2 3 4 5 6 7
1990 // ------------------------------
1991 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1992 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1993 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1994 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1995
1996 // VMASKLOAD takes an *i8 src pointer
1997 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1998
1999 // Load the indices; OOB loads 0
2000 return MASKLOADD(pIndices,vIndexMask);
2001 }
2002
2003 //////////////////////////////////////////////////////////////////////////
2004 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
2005 /// denormalizes if needed, converts to F32 if needed, and positions in
2006 /// the proper SIMD rows to be output to the simdvertex structure
2007 /// @param args: (tuple of args, listed below)
2008 /// @param vGatherResult - 8 gathered 8bpc vertices
2009 /// @param pVtxOut - base pointer to output simdvertex struct
2010 /// @param extendType - sign extend or zero extend
2011 /// @param conversionType - conversion (normalize/scale) to apply, if any
2012 /// @param currentVertexElement - reference to the current vVertexElement
2013 /// @param outputElt - reference to the current simdvertex offset we're outputting to
2014 /// @param compMask - component packing mask
2015 /// @param compCtrl - component control val
2016 /// @param vVertexElements[4] - vertex components to output
2017 /// @param swizzle[4] - component swizzle location
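///
/// For example, an RGBA8 attribute arrives as one packed xyzw byte quad per gathered
/// 32bit lane; the shuffles below regroup the bytes per component, extend them to 32
/// bits, and (for normalized formats) scale by 1/255 (unsigned) or 1/127 (signed).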
2018 #if USE_SIMD16_SHADERS
2019 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2)
2020 #else
2021 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
2022 #endif
2023 {
2024 // Unpack tuple args
2025 Value*& vGatherResult = std::get<0>(args);
2026 Value* pVtxOut = std::get<1>(args);
2027 const Instruction::CastOps extendType = std::get<2>(args);
2028 const ConversionType conversionType = std::get<3>(args);
2029 uint32_t &currentVertexElement = std::get<4>(args);
2030 uint32_t &outputElt = std::get<5>(args);
2031 const ComponentEnable compMask = std::get<6>(args);
2032 const ComponentControl (&compCtrl)[4] = std::get<7>(args);
2033 Value* (&vVertexElements)[4] = std::get<8>(args);
2034 const uint32_t (&swizzle)[4] = std::get<9>(args);
2035
2036 // cast types
2037 Type* vGatherTy = mSimdInt32Ty;
2038 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
2039
2040 // have to do extra work for sign extending
2041 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
2042         Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 16x8bit ints in a 128bit lane
2043 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2044
2045 // shuffle mask, including any swizzling
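        // the gather leaves each 32bit lane holding packed xyzw bytes; selecting bytes
        // n, n+4, n+8 and n+12 pulls the same (swizzled) component from four adjacent
        // vertices so it lands contiguously within each 128bit half of the register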
2046 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
2047 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
2048 Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
2049 char(y), char(y+4), char(y+8), char(y+12),
2050 char(z), char(z+4), char(z+8), char(z+12),
2051 char(w), char(w+4), char(w+8), char(w+12),
2052 char(x), char(x+4), char(x+8), char(x+12),
2053 char(y), char(y+4), char(y+8), char(y+12),
2054 char(z), char(z+4), char(z+8), char(z+12),
2055 char(w), char(w+4), char(w+8), char(w+12)});
2056
2057 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
2058 // after pshufb: group components together in each 128bit lane
2059 // 256i - 0 1 2 3 4 5 6 7
2060 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
2061
2062 Value* vi128XY = nullptr;
2063 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
2064 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
2065 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
2066 // 256i - 0 1 2 3 4 5 6 7
2067 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
2068 }
2069
2070 // do the same for zw components
2071 Value* vi128ZW = nullptr;
2072 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
2073 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
2074 }
2075
2076 // init denormalize variables if needed
2077 Instruction::CastOps fpCast;
2078 Value* conversionFactor;
2079
2080 switch (conversionType)
2081 {
2082 case CONVERT_NORMALIZED:
2083 fpCast = Instruction::CastOps::SIToFP;
2084 conversionFactor = VIMMED1((float)(1.0 / 127.0));
2085 break;
2086 case CONVERT_SSCALED:
2087 fpCast = Instruction::CastOps::SIToFP;
2088 conversionFactor = VIMMED1((float)(1.0));
2089 break;
2090 case CONVERT_USCALED:
2091 SWR_INVALID("Type should not be sign extended!");
2092 conversionFactor = nullptr;
2093 break;
2094 default:
2095 SWR_ASSERT(conversionType == CONVERT_NONE);
2096 conversionFactor = nullptr;
2097 break;
2098 }
2099
2100         // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2101 for (uint32_t i = 0; i < 4; i++)
2102 {
2103 if (isComponentEnabled(compMask, i))
2104 {
2105 if (compCtrl[i] == ComponentControl::StoreSrc)
2106 {
2107 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2108 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2109 // if x or y, use vi128XY permute result, else use vi128ZW
2110 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
2111
2112 // sign extend
2113 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
2114
2115 // denormalize if needed
2116 if (conversionType != CONVERT_NONE)
2117 {
2118 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2119 }
2120 currentVertexElement++;
2121 }
2122 else
2123 {
2124 #if USE_SIMD16_SHADERS
2125 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2126 #else
2127 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2128 #endif
2129 }
2130
2131 if (currentVertexElement > 3)
2132 {
2133 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2134 // reset to the next vVertexElement to output
2135 currentVertexElement = 0;
2136 }
2137 }
2138 }
2139 }
2140 // else zero extend
2141 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2142 {
2143 // init denormalize variables if needed
2144 Instruction::CastOps fpCast;
2145 Value* conversionFactor;
2146
2147 switch (conversionType)
2148 {
2149 case CONVERT_NORMALIZED:
2150 fpCast = Instruction::CastOps::UIToFP;
2151 conversionFactor = VIMMED1((float)(1.0 / 255.0));
2152 break;
2153 case CONVERT_USCALED:
2154 fpCast = Instruction::CastOps::UIToFP;
2155 conversionFactor = VIMMED1((float)(1.0));
2156 break;
2157 case CONVERT_SSCALED:
2158 SWR_INVALID("Type should not be zero extended!");
2159 conversionFactor = nullptr;
2160 break;
2161 default:
2162 SWR_ASSERT(conversionType == CONVERT_NONE);
2163 conversionFactor = nullptr;
2164 break;
2165 }
2166
2167 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
2168 for (uint32_t i = 0; i < 4; i++)
2169 {
2170 if (isComponentEnabled(compMask, i))
2171 {
2172 if (compCtrl[i] == ComponentControl::StoreSrc)
2173 {
2174 // pshufb masks for each component
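                    // each mask keeps only byte swizzle[i] of every packed 32bit lane and
                    // zeroes the rest, which extracts the component and zero extends it to
                    // 32 bits in a single pshufb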
2175 Value* vConstMask;
2176 switch (swizzle[i])
2177 {
2178 case 0:
2179 // x shuffle mask
2180 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
2181 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
2182 break;
2183 case 1:
2184 // y shuffle mask
2185 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
2186 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
2187 break;
2188 case 2:
2189 // z shuffle mask
2190 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
2191 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
2192 break;
2193 case 3:
2194 // w shuffle mask
2195 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
2196 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
2197 break;
2198 default:
2199 vConstMask = nullptr;
2200 break;
2201 }
2202
2203 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
2204 // after pshufb for x channel
2205 // 256i - 0 1 2 3 4 5 6 7
2206 // x000 x000 x000 x000 x000 x000 x000 x000
2207
2208 // denormalize if needed
2209 if (conversionType != CONVERT_NONE)
2210 {
2211 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2212 }
2213 currentVertexElement++;
2214 }
2215 else
2216 {
2217 #if USE_SIMD16_SHADERS
2218 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2219 #else
2220 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2221 #endif
2222 }
2223
2224 if (currentVertexElement > 3)
2225 {
2226 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2227 // reset to the next vVertexElement to output
2228 currentVertexElement = 0;
2229 }
2230 }
2231 }
2232 }
2233 else
2234 {
2235 SWR_INVALID("Unsupported conversion type");
2236 }
2237 }
2238
2239 #if USE_SIMD16_BUILDER
2240 void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args)
2241 {
2242 // Unpack tuple args
2243 Value*& vGatherResult = std::get<0>(args);
2244 Value* pVtxOut = std::get<1>(args);
2245 const Instruction::CastOps extendType = std::get<2>(args);
2246 const ConversionType conversionType = std::get<3>(args);
2247 uint32_t &currentVertexElement = std::get<4>(args);
2248 uint32_t &outputElt = std::get<5>(args);
2249 const ComponentEnable compMask = std::get<6>(args);
2250 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2251 Value* (&vVertexElements)[4] = std::get<8>(args);
2252 const uint32_t(&swizzle)[4] = std::get<9>(args);
2253
2254 // cast types
2255 Type *vGatherTy = mSimdInt32Ty;
2256 Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2257
2258 // have to do extra work for sign extending
2259 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
2260 {
2261         Type *v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 16x8bit ints in a 128bit lane
2262 Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2263
2264 // shuffle mask, including any swizzling
2265 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
2266 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
2267 Value *vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12),
2268 char(y), char(y + 4), char(y + 8), char(y + 12),
2269 char(z), char(z + 4), char(z + 8), char(z + 12),
2270 char(w), char(w + 4), char(w + 8), char(w + 12),
2271 char(x), char(x + 4), char(x + 8), char(x + 12),
2272 char(y), char(y + 4), char(y + 8), char(y + 12),
2273 char(z), char(z + 4), char(z + 8), char(z + 12),
2274 char(w), char(w + 4), char(w + 8), char(w + 12) });
2275
2276         // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
2277
2278 Value *vGatherResult_lo = EXTRACT2_I(vGatherResult, 0);
2279 Value *vGatherResult_hi = EXTRACT2_I(vGatherResult, 1);
2280
2281 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2282 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2283
2284 // after pshufb: group components together in each 128bit lane
2285 // 256i - 0 1 2 3 4 5 6 7
2286 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
2287
2288 Value *vi128XY_lo = nullptr;
2289 Value *vi128XY_hi = nullptr;
2290 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
2291 {
2292 vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
2293 vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
2294
2295 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
2296 // 256i - 0 1 2 3 4 5 6 7
2297 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
2298 }
2299
2300 // do the same for zw components
2301 Value *vi128ZW_lo = nullptr;
2302 Value *vi128ZW_hi = nullptr;
2303 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
2304 {
2305 vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
2306 vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
2307 }
2308
2309 // init denormalize variables if needed
2310 Instruction::CastOps fpCast;
2311 Value *conversionFactor;
2312
2313 switch (conversionType)
2314 {
2315 case CONVERT_NORMALIZED:
2316 fpCast = Instruction::CastOps::SIToFP;
2317 conversionFactor = VIMMED1((float)(1.0 / 127.0));
2318 break;
2319 case CONVERT_SSCALED:
2320 fpCast = Instruction::CastOps::SIToFP;
2321 conversionFactor = VIMMED1((float)(1.0));
2322 break;
2323 case CONVERT_USCALED:
2324 SWR_INVALID("Type should not be sign extended!");
2325 conversionFactor = nullptr;
2326 break;
2327 default:
2328 SWR_ASSERT(conversionType == CONVERT_NONE);
2329 conversionFactor = nullptr;
2330 break;
2331 }
2332
2333         // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2334 for (uint32_t i = 0; i < 4; i++)
2335 {
2336 if (isComponentEnabled(compMask, i))
2337 {
2338 if (compCtrl[i] == ComponentControl::StoreSrc)
2339 {
2340 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2341 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2342 // if x or y, use vi128XY permute result, else use vi128ZW
2343 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
2344 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
2345
2346 // sign extend
2347 Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
2348 Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
2349
2350 // denormalize if needed
2351 if (conversionType != CONVERT_NONE)
2352 {
2353 temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2354 temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2355 }
2356
2357 vVertexElements[currentVertexElement] = VUNDEF2_F();
2358 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
2359 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
2360
2361 currentVertexElement += 1;
2362 }
2363 else
2364 {
2365 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
2366 }
2367
2368 if (currentVertexElement > 3)
2369 {
2370 StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
2371 // reset to the next vVertexElement to output
2372 currentVertexElement = 0;
2373 }
2374 }
2375 }
2376 }
2377 // else zero extend
2378 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2379 {
2380 // init denormalize variables if needed
2381 Instruction::CastOps fpCast;
2382 Value *conversionFactor;
2383
2384 switch (conversionType)
2385 {
2386 case CONVERT_NORMALIZED:
2387 fpCast = Instruction::CastOps::UIToFP;
2388 conversionFactor = VIMMED1((float)(1.0 / 255.0));
2389 break;
2390 case CONVERT_USCALED:
2391 fpCast = Instruction::CastOps::UIToFP;
2392 conversionFactor = VIMMED1((float)(1.0));
2393 break;
2394 case CONVERT_SSCALED:
2395 SWR_INVALID("Type should not be zero extended!");
2396 conversionFactor = nullptr;
2397 break;
2398 default:
2399 SWR_ASSERT(conversionType == CONVERT_NONE);
2400 conversionFactor = nullptr;
2401 break;
2402 }
2403
2404 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
2405 for (uint32_t i = 0; i < 4; i++)
2406 {
2407 if (isComponentEnabled(compMask, i))
2408 {
2409 if (compCtrl[i] == ComponentControl::StoreSrc)
2410 {
2411 // pshufb masks for each component
2412 Value *vConstMask;
2413 switch (swizzle[i])
2414 {
2415 case 0:
2416 // x shuffle mask
2417 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
2418 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
2419 break;
2420 case 1:
2421 // y shuffle mask
2422 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
2423 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
2424 break;
2425 case 2:
2426 // z shuffle mask
2427 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
2428 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
2429 break;
2430 case 3:
2431 // w shuffle mask
2432 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
2433 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
2434 break;
2435 default:
2436 vConstMask = nullptr;
2437 break;
2438 }
2439
2440 Value *vGatherResult_lo = EXTRACT2_I(vGatherResult, 0);
2441 Value *vGatherResult_hi = EXTRACT2_I(vGatherResult, 1);
2442
2443 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2444 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2445
2446 // after pshufb for x channel
2447 // 256i - 0 1 2 3 4 5 6 7
2448 // x000 x000 x000 x000 x000 x000 x000 x000
2449
2450 // denormalize if needed
2451 if (conversionType != CONVERT_NONE)
2452 {
2453 temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2454 temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2455 }
2456
2457 vVertexElements[currentVertexElement] = VUNDEF2_F();
2458 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
2459 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
2460
2461 currentVertexElement += 1;
2462 }
2463 else
2464 {
2465 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
2466 }
2467
2468 if (currentVertexElement > 3)
2469 {
2470 StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
2471 // reset to the next vVertexElement to output
2472 currentVertexElement = 0;
2473 }
2474 }
2475 }
2476 }
2477 else
2478 {
2479 SWR_INVALID("Unsupported conversion type");
2480 }
2481 }
2482
2483 #endif
2484 //////////////////////////////////////////////////////////////////////////
2485 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
2486 /// denormalizes if needed, converts to F32 if needed, and positions in
2487 /// the proper SIMD rows to be output to the simdvertex structure
2488 /// @param args: (tuple of args, listed below)
2489 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
2490 /// @param pVtxOut - base pointer to output simdvertex struct
2491 /// @param extendType - sign extend or zero extend
2492 /// @param conversionType - conversion (normalize/scale) to apply, if any
2493 /// @param currentVertexElement - reference to the current vVertexElement
2494 /// @param outputElt - reference to the current simdvertex offset we're outputting to
2495 /// @param compMask - component packing mask
2496 /// @param compCtrl - component control val
2497 /// @param vVertexElements[4] - vertex components to output
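///
/// Note: this path covers both 16bit integer formats (sign/zero extended and optionally
/// normalized by 1/32767 or 1/65535) and FP16 data, which is expanded to FP32 via CVTPH2PS.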
2498 #if USE_SIMD16_SHADERS
2499 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2)
2500 #else
2501 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
2502 #endif
2503 {
2504 // Unpack tuple args
2505 Value* (&vGatherResult)[2] = std::get<0>(args);
2506 Value* pVtxOut = std::get<1>(args);
2507 const Instruction::CastOps extendType = std::get<2>(args);
2508 const ConversionType conversionType = std::get<3>(args);
2509 uint32_t &currentVertexElement = std::get<4>(args);
2510 uint32_t &outputElt = std::get<5>(args);
2511 const ComponentEnable compMask = std::get<6>(args);
2512 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2513 Value* (&vVertexElements)[4] = std::get<8>(args);
2514
2515 // cast types
2516 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
2517 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2518
2519 // have to do extra work for sign extending
2520 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
2521 (extendType == Instruction::CastOps::FPExt))
2522 {
2523         // is this a half-precision (FP16) float?
2524 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
2525
2526 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
2527 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2528
2529 // shuffle mask
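        // bytes {0,1, 4,5, 8,9, 12,13} collect the low 16bit value (x or z) of each packed
        // lane and bytes {2,3, 6,7, 10,11, 14,15} the high one (y or w), producing the
        // xxxx/yyyy grouping shown below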
2530 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2531 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
2532 Value* vi128XY = nullptr;
2533 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
2534 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
2535 // after pshufb: group components together in each 128bit lane
2536 // 256i - 0 1 2 3 4 5 6 7
2537 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
2538
2539 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
2540 // after PERMD: move and pack xy components into each 128bit lane
2541 // 256i - 0 1 2 3 4 5 6 7
2542 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
2543 }
2544
2545 // do the same for zw components
2546 Value* vi128ZW = nullptr;
2547 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
2548 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
2549 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
2550 }
2551
2552 // init denormalize variables if needed
2553 Instruction::CastOps IntToFpCast;
2554 Value* conversionFactor;
2555
2556 switch (conversionType)
2557 {
2558 case CONVERT_NORMALIZED:
2559 IntToFpCast = Instruction::CastOps::SIToFP;
2560 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2561 break;
2562 case CONVERT_SSCALED:
2563 IntToFpCast = Instruction::CastOps::SIToFP;
2564 conversionFactor = VIMMED1((float)(1.0));
2565 break;
2566 case CONVERT_USCALED:
2567 SWR_INVALID("Type should not be sign extended!");
2568 conversionFactor = nullptr;
2569 break;
2570 default:
2571 SWR_ASSERT(conversionType == CONVERT_NONE);
2572 conversionFactor = nullptr;
2573 break;
2574 }
2575
2576         // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2577 for (uint32_t i = 0; i < 4; i++)
2578 {
2579 if (isComponentEnabled(compMask, i))
2580 {
2581 if (compCtrl[i] == ComponentControl::StoreSrc)
2582 {
2583 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2584 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2585 // if x or y, use vi128XY permute result, else use vi128ZW
2586 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
2587
2588 if (bFP) {
2589                     // extract 128 bit lanes and convert each half-precision component to full float
2590 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2591 }
2592 else {
2593 // extract 128 bit lanes to sign extend each component
2594 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2595
2596 // denormalize if needed
2597 if (conversionType != CONVERT_NONE) {
2598 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2599 }
2600 }
2601 currentVertexElement++;
2602 }
2603 else
2604 {
2605 #if USE_SIMD16_SHADERS
2606 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2607 #else
2608 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2609 #endif
2610 }
2611
2612 if (currentVertexElement > 3)
2613 {
2614 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2615 // reset to the next vVertexElement to output
2616 currentVertexElement = 0;
2617 }
2618 }
2619 }
2620 }
2621 // else zero extend
2622 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2623 {
2624 // pshufb masks for each component
2625 Value* vConstMask[2];
2626 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
2627 // x/z shuffle mask
2628 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2629 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
2630 }
2631
2632 if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
2633 // y/w shuffle mask
2634 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2635 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
2636 }
2637
2638 // init denormalize variables if needed
2639 Instruction::CastOps fpCast;
2640 Value* conversionFactor;
2641
2642 switch (conversionType)
2643 {
2644 case CONVERT_NORMALIZED:
2645 fpCast = Instruction::CastOps::UIToFP;
2646 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2647 break;
2648 case CONVERT_USCALED:
2649 fpCast = Instruction::CastOps::UIToFP;
2650 conversionFactor = VIMMED1((float)(1.0f));
2651 break;
2652 case CONVERT_SSCALED:
2653 SWR_INVALID("Type should not be zero extended!");
2654 conversionFactor = nullptr;
2655 break;
2656 default:
2657 SWR_ASSERT(conversionType == CONVERT_NONE);
2658 conversionFactor = nullptr;
2659 break;
2660 }
2661
2662 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2663 for (uint32_t i = 0; i < 4; i++)
2664 {
2665 if (isComponentEnabled(compMask, i))
2666 {
2667 if (compCtrl[i] == ComponentControl::StoreSrc)
2668 {
2669 // select correct constMask for x/z or y/w pshufb
2670 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2671 // if x or y, use vi128XY permute result, else use vi128ZW
2672 uint32_t selectedGather = (i < 2) ? 0 : 1;
2673
2674 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2675 // after pshufb mask for x channel; z uses the same shuffle from the second gather
2676 // 256i - 0 1 2 3 4 5 6 7
2677 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2678
2679 // denormalize if needed
2680 if (conversionType != CONVERT_NONE)
2681 {
2682 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2683 }
2684 currentVertexElement++;
2685 }
2686 else
2687 {
2688 #if USE_SIMD16_SHADERS
2689 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2690 #else
2691 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2692 #endif
2693 }
2694
2695 if (currentVertexElement > 3)
2696 {
2697 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2698 // reset to the next vVertexElement to output
2699 currentVertexElement = 0;
2700 }
2701 }
2702 }
2703 }
2704 else
2705 {
2706 SWR_INVALID("Unsupported conversion type");
2707 }
2708 }
2709
2710 #if USE_SIMD16_BUILDER
2711 void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args)
2712 {
2713 // Unpack tuple args
2714 Value* (&vGatherResult)[2] = std::get<0>(args);
2715 Value* pVtxOut = std::get<1>(args);
2716 const Instruction::CastOps extendType = std::get<2>(args);
2717 const ConversionType conversionType = std::get<3>(args);
2718 uint32_t &currentVertexElement = std::get<4>(args);
2719 uint32_t &outputElt = std::get<5>(args);
2720 const ComponentEnable compMask = std::get<6>(args);
2721 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2722 Value* (&vVertexElements)[4] = std::get<8>(args);
2723
2724 // cast types
2725 Type *vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
2726 Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2727
2728 // have to do extra work for sign extending
2729 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
2730 {
2731         // is this a half-precision (FP16) float?
2732 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
2733
2734 Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
2735 Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2736
2737 // shuffle mask
2738 Value *vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2739 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
2740 Value *vi128XY = nullptr;
2741 Value *vi128XY_lo = nullptr;
2742 Value *vi128XY_hi = nullptr;
2743 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
2744 {
2745             // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
2746
2747 Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[0], 0);
2748 Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[0], 1);
2749
2750 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2751 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2752
2753 // after pshufb: group components together in each 128bit lane
2754 // 256i - 0 1 2 3 4 5 6 7
2755 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
2756
2757 vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2758 vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2759
2760 // after PERMD: move and pack xy components into each 128bit lane
2761 // 256i - 0 1 2 3 4 5 6 7
2762 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
2763 #if 0
2764
2765 vi128XY = VUNDEF2_I();
2766 vi128XY = INSERT2_I(vi128XY, vi128XY_lo, 0);
2767 vi128XY = INSERT2_I(vi128XY, vi128XY_hi, 1);
2768 #endif
2769 }
2770
2771 // do the same for zw components
2772 Value *vi128ZW = nullptr;
2773 Value *vi128ZW_lo = nullptr;
2774 Value *vi128ZW_hi = nullptr;
2775 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
2776 {
2777 Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[1], 0);
2778 Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[1], 1);
2779
2780 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2781 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2782
2783 vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2784 vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2785 #if 0
2786
2787 vi128ZW = VUNDEF2_I();
2788 vi128ZW = INSERT2_I(vi128ZW, vi128ZW_lo, 0);
2789 vi128ZW = INSERT2_I(vi128ZW, vi128ZW_hi, 1);
2790 #endif
2791 }
2792
2793 // init denormalize variables if needed
2794 Instruction::CastOps IntToFpCast;
2795 Value *conversionFactor;
2796
2797 switch (conversionType)
2798 {
2799 case CONVERT_NORMALIZED:
2800 IntToFpCast = Instruction::CastOps::SIToFP;
2801 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2802 break;
2803 case CONVERT_SSCALED:
2804 IntToFpCast = Instruction::CastOps::SIToFP;
2805 conversionFactor = VIMMED1((float)(1.0));
2806 break;
2807 case CONVERT_USCALED:
2808 SWR_INVALID("Type should not be sign extended!");
2809 conversionFactor = nullptr;
2810 break;
2811 default:
2812 SWR_ASSERT(conversionType == CONVERT_NONE);
2813 conversionFactor = nullptr;
2814 break;
2815 }
2816
2817         // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2818 for (uint32_t i = 0; i < 4; i++)
2819 {
2820 if (isComponentEnabled(compMask, i))
2821 {
2822 if (compCtrl[i] == ComponentControl::StoreSrc)
2823 {
2824 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2825 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2826 // if x or y, use vi128XY permute result, else use vi128ZW
2827 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
2828 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
2829
2830 if (bFP)
2831 {
2832                         // extract 128 bit lanes and convert each half-precision component to full float
2833 Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
2834 Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
2835
2836 vVertexElements[currentVertexElement] = VUNDEF2_F();
2837 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
2838 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
2839 }
2840 else
2841 {
2842 // extract 128 bit lanes to sign extend each component
2843 Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
2844 Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
2845
2846 // denormalize if needed
2847 if (conversionType != CONVERT_NONE)
2848 {
2849 temp_lo = FMUL(CAST(IntToFpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2850 temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2851 }
2852
2853 vVertexElements[currentVertexElement] = VUNDEF2_F();
2854 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
2855 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
2856 }
2857
2858 currentVertexElement += 1;
2859 }
2860 else
2861 {
2862 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
2863 }
2864
2865 if (currentVertexElement > 3)
2866 {
2867 StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
2868 // reset to the next vVertexElement to output
2869 currentVertexElement = 0;
2870 }
2871 }
2872 }
2873 }
2874 // else zero extend
2875 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2876 {
2877 // pshufb masks for each component
2878 Value *vConstMask[2];
2879
2880 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
2881 {
2882 // x/z shuffle mask
2883 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2884 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
2885 }
2886
2887 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
2888 {
2889 // y/w shuffle mask
2890 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2891 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
2892 }
2893
2894 // init denormalize variables if needed
2895 Instruction::CastOps fpCast;
2896 Value* conversionFactor;
2897
2898 switch (conversionType)
2899 {
2900 case CONVERT_NORMALIZED:
2901 fpCast = Instruction::CastOps::UIToFP;
2902 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2903 break;
2904 case CONVERT_USCALED:
2905 fpCast = Instruction::CastOps::UIToFP;
2906 conversionFactor = VIMMED1((float)(1.0f));
2907 break;
2908 case CONVERT_SSCALED:
2909 SWR_INVALID("Type should not be zero extended!");
2910 conversionFactor = nullptr;
2911 break;
2912 default:
2913 SWR_ASSERT(conversionType == CONVERT_NONE);
2914 conversionFactor = nullptr;
2915 break;
2916 }
2917
2918 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2919 for (uint32_t i = 0; i < 4; i++)
2920 {
2921 if (isComponentEnabled(compMask, i))
2922 {
2923 if (compCtrl[i] == ComponentControl::StoreSrc)
2924 {
2925 // select correct constMask for x/z or y/w pshufb
2926 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2927 // if x or y, use vi128XY permute result, else use vi128ZW
2928 uint32_t selectedGather = (i < 2) ? 0 : 1;
2929
2930                     // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
2931
2932 Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[selectedGather], 0);
2933 Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[selectedGather], 1);
2934
2935 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2936 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2937
2938 // after pshufb mask for x channel; z uses the same shuffle from the second gather
2939 // 256i - 0 1 2 3 4 5 6 7
2940 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2941
2942 // denormalize if needed
2943 if (conversionType != CONVERT_NONE)
2944 {
2945 temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2946 temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2947 }
2948
2949 vVertexElements[currentVertexElement] = VUNDEF2_F();
2950 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
2951 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
2952
2953 currentVertexElement += 1;
2954 }
2955 else
2956 {
2957 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
2958 }
2959
2960 if (currentVertexElement > 3)
2961 {
2962 StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
2963 // reset to the next vVertexElement to output
2964 currentVertexElement = 0;
2965 }
2966 }
2967 }
2968 }
2969 else
2970 {
2971 SWR_INVALID("Unsupported conversion type");
2972 }
2973 }
2974
2975 #endif
2976 //////////////////////////////////////////////////////////////////////////
2977 /// @brief Output a simdvertex worth of elements to the current outputElt
2978 /// @param pVtxOut - base address of VIN output struct
2979 /// @param outputElt - simdvertex offset in VIN to write to
2980 /// @param numEltsToStore - number of simdvertex rows to write out
2981 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
2982 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2983 {
2984 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2985
2986 for(uint32_t c = 0; c < numEltsToStore; ++c)
2987 {
2988 // STORE expects FP32 x vWidth type, just bitcast if needed
2989 if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2990 {
2991 #if FETCH_DUMP_VERTEX
2992 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
2993 #endif
2994 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
2995 }
2996 #if FETCH_DUMP_VERTEX
2997 else
2998 {
2999 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
3000 }
3001 #endif
3002 // outputElt * 4 = offsetting by the size of a simdvertex
3003 // + c offsets to a 32bit x vWidth row within the current vertex
3004 #if USE_SIMD16_SHADERS
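    // with SIMD16 shaders a simdvertex spans two SIMD8 halves, so the vertex stride is 8
    // rows and each component occupies 2 rows; the caller writes the second half through
    // GEP(pVtxOut, C(1))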
3005 Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP");
3006 #else
3007 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
3008 #endif
3009 STORE(vVertexElements[c], dest);
3010 }
3011 }
3012
3013 #if USE_SIMD16_BUILDER
3014 void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
3015 {
3016 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
3017
3018 for (uint32_t c = 0; c < numEltsToStore; ++c)
3019 {
3020 // STORE expects FP32 x vWidth type, just bitcast if needed
3021 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
3022 {
3023 #if FETCH_DUMP_VERTEX
3024 PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
3025 #endif
3026 vVertexElements[c] = BITCAST(vVertexElements[c], mSimd2FP32Ty);
3027 }
3028 #if FETCH_DUMP_VERTEX
3029 else
3030 {
3031 PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
3032 }
3033 #endif
3034 // outputElt * 4 = offsetting by the size of a simdvertex
3035 // + c offsets to a 32bit x vWidth row within the current vertex
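        // (rows here are full simd16 vectors, matching the mSimd2FP32Ty elements stored below)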
3036 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
3037 STORE(vVertexElements[c], dest);
3038 }
3039 }
3040
3041 #endif
3042 //////////////////////////////////////////////////////////////////////////
3043 /// @brief Generates a constant vector of values based on the
3044 /// ComponentControl value
3045 /// @param ctrl - ComponentControl value
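/// (typically used to synthesize components that are not fetched from memory,
/// e.g. defaulting a missing alpha component to 1.0f via Store1Fp)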
3046 #if USE_SIMD16_SHADERS
3047 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2)
3048 #else
3049 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
3050 #endif
3051 {
3052 switch(ctrl)
3053 {
3054 case NoStore: return VUNDEF_I();
3055 case Store0: return VIMMED1(0);
3056 case Store1Fp: return VIMMED1(1.0f);
3057 case Store1Int: return VIMMED1(1);
3058 case StoreVertexId:
3059 {
3060 #if USE_SIMD16_SHADERS
3061 Value* pId;
3062 if (useVertexID2)
3063 {
3064 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
3065 }
3066 else
3067 {
3068 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
3069 }
3070 #else
3071 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
3072 #endif
3073 return VBROADCAST(pId);
3074 }
3075 case StoreInstanceId:
3076 {
3077 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
3078 return VBROADCAST(pId);
3079 }
3080 case StoreSrc:
3081 default: SWR_INVALID("Invalid component control"); return VUNDEF_I();
3082 }
3083 }
3084
3085 #if USE_SIMD16_BUILDER
3086 Value* FetchJit::GenerateCompCtrlVector2(const ComponentControl ctrl)
3087 {
3088 switch (ctrl)
3089 {
3090 case NoStore: return VUNDEF2_I();
3091 case Store0: return VIMMED2_1(0);
3092 case Store1Fp: return VIMMED2_1(1.0f);
3093 case Store1Int: return VIMMED2_1(1);
3094 case StoreVertexId:
3095 {
3096 Value* pId = VUNDEF2_F();
3097
3098 Value* pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
3099 Value* pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
3100
3101 pId = INSERT2_F(pId, pId_lo, 0);
3102 pId = INSERT2_F(pId, pId_hi, 1);
3103
3104 return VBROADCAST2(pId);
3105 }
3106 case StoreInstanceId:
3107 {
3108 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
3109 return VBROADCAST2(pId);
3110 }
3111 case StoreSrc:
3112 default: SWR_INVALID("Invalid component control"); return VUNDEF2_I();
3113 }
3114 }
3115
3116 #endif
3117 //////////////////////////////////////////////////////////////////////////
3118 /// @brief Returns the enable mask for the specified component.
3119 /// @param enableMask - enable bits
3120 /// @param component - component to check if enabled.
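/// e.g. an enableMask with only X and Y set reports components 0 and 1 as enabled
/// and components 2 and 3 as disabled.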
3121 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
3122 {
3123 switch (component)
3124 {
3125 // X
3126 case 0: return (enableMask & ComponentEnable::X);
3127 // Y
3128 case 1: return (enableMask & ComponentEnable::Y);
3129 // Z
3130 case 2: return (enableMask & ComponentEnable::Z);
3131 // W
3132 case 3: return (enableMask & ComponentEnable::W);
3133
3134 default: return false;
3135 }
3136 }
3137
3138
3139 //////////////////////////////////////////////////////////////////////////
3140 /// @brief JITs from fetch shader IR
3141 /// @param hJitMgr - JitManager handle
3142 /// @param hFunc - handle to the LLVM function IR
3143 /// @return PFN_FETCH_FUNC - pointer to fetch code
3144 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
3145 {
3146     const llvm::Function* func = reinterpret_cast<const llvm::Function*>(hFunc);
3147 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
3148 PFN_FETCH_FUNC pfnFetch;
3149
3150 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
3151     // MCJIT finalizes modules the first time code is JITed from them; once finalized, no new IR can be added to the module
3152 pJitMgr->mIsModuleFinalized = true;
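    // (hence JitCompileFetch below sets up a fresh module via SetupNewModule() for every compile)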
3153
3154 #if defined(KNOB_SWRC_TRACING)
3155 char fName[1024];
3156 const char *funcName = func->getName().data();
3157     snprintf(fName, sizeof(fName), "%s.bin", funcName);
3158 FILE *fd = fopen(fName, "wb");
3159 fwrite((void *)pfnFetch, 1, 2048, fd);
3160 fclose(fd);
3161 #endif
3162
3163 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
3164
3165 return pfnFetch;
3166 }
3167
3168 //////////////////////////////////////////////////////////////////////////
3169 /// @brief JIT compiles fetch shader
3170 /// @param hJitMgr - JitManager handle
3171 /// @param state - fetch state to build function from
3172 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
3173 {
3174 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
3175
3176 pJitMgr->SetupNewModule();
3177
3178 FetchJit theJit(pJitMgr);
3179 HANDLE hFunc = theJit.Create(state);
3180
3181 return JitFetchFunc(hJitMgr, hFunc);
3182 }
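
// Usage sketch (illustrative only, not part of the original source): a driver would
// typically fill out a FETCH_COMPILE_STATE once per unique vertex layout, compile it,
// and cache the result, e.g.
//
//     FETCH_COMPILE_STATE fetchState = {};   // populated from the pipeline state (hypothetical setup)
//     PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, fetchState);
//
// The returned pfnFetch is then invoked per draw with a SWR_FETCH_CONTEXT and the
// vertex output buffer described above.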