1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "jit_api.h"
32 #include "fetch_jit.h"
33 #include "gen_state_llvm.h"
34 #include <sstream>
35 #include <tuple>
36
37 //#define FETCH_DUMP_VERTEX 1
38 using namespace llvm;
39 using namespace SwrJit;
40
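// helper: tests whether a given component (0=x .. 3=w) is set in the component enable mask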
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45 CONVERT_NONE,
46 CONVERT_NORMALIZED,
47 CONVERT_USCALED,
48 CONVERT_SSCALED,
49 CONVERT_SFIXED,
50 };
51
52 //////////////////////////////////////////////////////////////////////////
53 /// Interface to Jitting a fetch shader
54 //////////////////////////////////////////////////////////////////////////
55 struct FetchJit : public Builder
56 {
57 FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
58
59 Function* Create(const FETCH_COMPILE_STATE& fetchState);
60 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
61 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
62 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
63
64 // package up Shuffle*bpcGatherd args into a tuple for convenience
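// tuple fields, in order: gathered result, output vertex pointer, extend cast op, conversion type,
// currentVertexElement (in/out), outputElt (in/out), component enable mask, component controls,
// output vertex elements, and the format swizzle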
65 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
66 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
67 const uint32_t(&)[4]> Shuffle8bpcArgs;
68 #if USE_SIMD16_SHADERS
69 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2);
70 #else
71 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
72 #endif
73 #if USE_SIMD16_BUILDER
74 void Shuffle8bpcGatherd2(Shuffle8bpcArgs &args);
75 #endif
76
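// Shuffle16bpcGather args: two gather results (xy and zw halves), then the same fields as above minus the swizzle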
77 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
78 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
79 #if USE_SIMD16_SHADERS
80 void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2);
81 #else
82 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
83 #endif
84 #if USE_SIMD16_BUILDER
85 void Shuffle16bpcGather2(Shuffle16bpcArgs &args);
86 #endif
87
88 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
89 #if USE_SIMD16_BUILDER
90 void StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
91 #endif
92
93 #if USE_SIMD16_SHADERS
94 Value* GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2);
95 #else
96 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
97 #endif
98 #if USE_SIMD16_BUILDER
99 Value* GenerateCompCtrlVector2(const ComponentControl ctrl);
100 #endif
101
102 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
103 #if USE_SIMD16_SHADERS
104 #define USE_SIMD16_GATHERS 0
105
106 #if USE_SIMD16_GATHERS
107 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2);
108 #else
109 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2);
110 #endif
111 #else
112 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
113 #endif
114
115 bool IsOddFormat(SWR_FORMAT format);
116 bool IsUniformFormat(SWR_FORMAT format);
117 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
118 void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
119 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
120
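// SWR_FETCH_CONTEXT* argument of the generated fetch shader (set in Create)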
121 Value* mpFetchInfo;
122 };
123
124 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
125 {
126 std::stringstream fnName("FetchShader_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
127 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
128
129 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
130 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
131
132 fetch->getParent()->setModuleIdentifier(fetch->getName());
133
134 IRB()->SetInsertPoint(entry);
135
136 auto argitr = fetch->arg_begin();
137
138 // Fetch shader arguments
139 mpFetchInfo = &*argitr; ++argitr;
140 mpFetchInfo->setName("fetchInfo");
141 Value* pVtxOut = &*argitr;
142 pVtxOut->setName("vtxOutput");
143 // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex
144 // index 0 (just the pointer to the simdvertex structure)
145 // index 1 (which element of the simdvertex structure to offset to, in this case 0)
146 // so the indices being i32's doesn't matter
147 // TODO: generate this GEP with a VECTOR structure type so this makes sense
148 std::vector<Value*> vtxInputIndices(2, C(0));
149 // GEP
150 pVtxOut = GEP(pVtxOut, C(0));
151 #if USE_SIMD16_SHADERS
152 #if 0// USE_SIMD16_BUILDER
153 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
154 #else
155 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
156 #endif
157 #else
158 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
159 #endif
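// pVtxOut now points to the output vertex data, reinterpreted as a pointer to a simd of floats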
160
161 // SWR_FETCH_CONTEXT::pStreams
162 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
163 streams->setName("pStreams");
164
165 // SWR_FETCH_CONTEXT::pIndices
166 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
167 indices->setName("pIndices");
168
169 // SWR_FETCH_CONTEXT::pLastIndex
170 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
171 pLastIndex->setName("pLastIndex");
172
173
174 Value* vIndices;
175 #if USE_SIMD16_SHADERS
176 Value* indices2;
177 Value* vIndices2;
178 #endif
179 switch(fetchState.indexType)
180 {
181 case R8_UINT:
182 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
183 #if USE_SIMD16_SHADERS
184 indices2 = GEP(indices, C(8));
185 #endif
186 if(fetchState.bDisableIndexOOBCheck)
187 {
188 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
189 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
190 #if USE_SIMD16_SHADERS
191 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
192 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
193 #endif
194 }
195 else
196 {
197 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
198 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
199 #if USE_SIMD16_SHADERS
200 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
201 vIndices2 = GetSimdValid8bitIndices(indices2, pLastIndex);
202 #endif
203 }
204 break;
205 case R16_UINT:
206 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
207 #if USE_SIMD16_SHADERS
208 indices2 = GEP(indices, C(8));
209 #endif
210 if(fetchState.bDisableIndexOOBCheck)
211 {
212 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
213 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
214 #if USE_SIMD16_SHADERS
215 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
216 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
217 #endif
218 }
219 else
220 {
221 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
222 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
223 #if USE_SIMD16_SHADERS
224 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
225 vIndices2 = GetSimdValid16bitIndices(indices2, pLastIndex);
226 #endif
227 }
228 break;
229 case R32_UINT:
230 #if USE_SIMD16_SHADERS
231 indices2 = GEP(indices, C(8));
232 #endif
233 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
234 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
235 #if USE_SIMD16_SHADERS
236 (fetchState.bDisableIndexOOBCheck) ? vIndices2 = LOAD(BITCAST(indices2, PointerType::get(mSimdInt32Ty, 0)), { (uint32_t)0 })
237 : vIndices2 = GetSimdValid32bitIndices(indices2, pLastIndex);
238 #endif
239 break; // incoming type is already 32bit int
240 default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break;
241 }
242
243 if(fetchState.bForceSequentialAccessEnable)
244 {
245 Value* pOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
246
247 // VertexData buffers are accessed sequentially; the index is equal to the vertex number
248 vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
249 vIndices = ADD(vIndices, pOffsets);
250 #if USE_SIMD16_SHADERS
251 vIndices2 = ADD(vIndices, VIMMED1(8));
252 #endif
253 }
254
255 Value* vVertexId = vIndices;
256 #if USE_SIMD16_SHADERS
257 Value* vVertexId2 = vIndices2;
258 #endif
259 if (fetchState.bVertexIDOffsetEnable)
260 {
261 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
262 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
263 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
264 vVertexId = ADD(vIndices, vBaseVertex);
265 vVertexId = ADD(vVertexId, vStartVertex);
266 #if USE_SIMD16_SHADERS
267 vVertexId2 = ADD(vIndices2, vBaseVertex);
268 vVertexId2 = ADD(vVertexId2, vStartVertex);
269 #endif
270 }
271
272 // store out vertex IDs
273 STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
274 #if USE_SIMD16_SHADERS
275 STORE(vVertexId2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
276 #endif
277
278 // store out cut mask if enabled
279 if (fetchState.bEnableCutIndex)
280 {
281 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
282 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
283 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
284 #if USE_SIMD16_SHADERS
285 Value* cutMask2 = VMASK(ICMP_EQ(vIndices2, vCutIndex));
286 STORE(cutMask2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
287 #endif
288 }
289
290 // Fetch attributes from memory and output to a simdvertex struct
291 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
292 #if USE_SIMD16_SHADERS
293 if (fetchState.bDisableVGATHER)
294 {
295 JitLoadVertices(fetchState, streams, vIndices, pVtxOut);
296 JitLoadVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)));
297 }
298 else
299 {
300 #if USE_SIMD16_GATHERS
301 JitGatherVertices(fetchState, streams, vIndices, vIndices2, pVtxOut, false);
302 #else
303 JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false);
304 JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true);
305 #endif
306 }
307 #else
308 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
309 : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
310 #endif
311
312 RET_VOID();
313
314 JitManager::DumpToFile(fetch, "src");
315
316 #if defined(_DEBUG)
317 verifyFunction(*fetch);
318 #endif
319
320 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
321
322 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
323 setupPasses.add(createBreakCriticalEdgesPass());
324 setupPasses.add(createCFGSimplificationPass());
325 setupPasses.add(createEarlyCSEPass());
326 setupPasses.add(createPromoteMemoryToRegisterPass());
327
328 setupPasses.run(*fetch);
329
330 JitManager::DumpToFile(fetch, "se");
331
332 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
333
334 ///@todo Haven't touched these either. Need to remove some of these and add others.
335 optPasses.add(createCFGSimplificationPass());
336 optPasses.add(createEarlyCSEPass());
337 optPasses.add(createInstructionCombiningPass());
338 optPasses.add(createInstructionSimplifierPass());
339 optPasses.add(createConstantPropagationPass());
340 optPasses.add(createSCCPPass());
341 optPasses.add(createAggressiveDCEPass());
342
343 optPasses.run(*fetch);
344 optPasses.run(*fetch);
345
346 JitManager::DumpToFile(fetch, "opt");
347
348 return fetch;
349 }
350
351 //////////////////////////////////////////////////////////////////////////
352 /// @brief Loads attributes from memory using LOADs, shuffling the
353 /// components into SOA form.
354 /// *Note* currently does not support component control,
355 /// component packing, instancing
356 /// @param fetchState - info about attributes to be fetched from memory
357 /// @param streams - value pointer to the current vertex stream
358 /// @param vIndices - vector value of indices to load
359 /// @param pVtxOut - value pointer to output simdvertex struct
360 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
361 {
362 // Zack shuffles; a variant of the Charleston.
363
364 std::vector<Value*> vectors(16);
365 std::vector<Constant*> pMask(mVWidth);
366 for(uint32_t i = 0; i < mVWidth; ++i)
367 {
368 pMask[i] = (C(i < 4 ? i : 4));
369 }
370 Constant* promoteMask = ConstantVector::get(pMask);
371 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
372
373 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
374 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
375 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
376 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
377 curInstance->setName("curInstance");
378
379 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
380 {
381 Value* elements[4] = {0};
382 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
383 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
384 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
385 uint32_t numComponents = info.numComps;
386 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
387
388 // load path doesn't support component packing
389 SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
390
391 vectors.clear();
392
393 if (fetchState.bInstanceIDOffsetEnable)
394 {
395 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down");
396 }
397
398 Value *vCurIndices;
399 Value *startOffset;
400 if(ied.InstanceEnable)
401 {
402 Value* stepRate = C(ied.InstanceAdvancementState);
403
404 // prevent a div by 0 for 0 step rate
405 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
406 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
407
408 // calc the current offset into instanced data buffer
409 Value* calcInstance = UDIV(curInstance, stepRate);
410
411 // if step rate is 0, every instance gets instance 0
412 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
413
414 vCurIndices = VBROADCAST(calcInstance);
415
416 startOffset = startInstance;
417 }
418 else if (ied.InstanceStrideEnable)
419 {
420 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
421 }
422 else
423 {
424 // offset indices by baseVertex
425 vCurIndices = ADD(vIndices, vBaseVertex);
426
427 startOffset = startVertex;
428 }
429
430 // load SWR_VERTEX_BUFFER_STATE::pData
431 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
432
433 // load SWR_VERTEX_BUFFER_STATE::pitch
434 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
435 stride = Z_EXT(stride, mInt64Ty);
436
437 // load SWR_VERTEX_BUFFER_STATE::size
438 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
439 size = Z_EXT(size, mInt64Ty);
440
441 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
442
443 Value *minVertex = NULL;
444 Value *minVertexOffset = NULL;
445 if (fetchState.bPartialVertexBuffer) {
446 // fetch min index for low bounds checking
447 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
448 minVertex = LOAD(minVertex);
449 if (!fetchState.bDisableIndexOOBCheck) {
450 minVertexOffset = MUL(Z_EXT(minVertex, mInt64Ty), stride);
451 }
452 }
453
454 // Load from the stream.
455 for(uint32_t lane = 0; lane < mVWidth; ++lane)
456 {
457 // Get index
458 Value* index = VEXTRACT(vCurIndices, C(lane));
459
460 if (fetchState.bPartialVertexBuffer) {
461 // clamp below minvertex
462 Value *isBelowMin = ICMP_SLT(index, minVertex);
463 index = SELECT(isBelowMin, minVertex, index);
464 }
465
466 index = Z_EXT(index, mInt64Ty);
467
468 Value* offset = MUL(index, stride);
469 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
470 offset = ADD(offset, startVertexOffset);
471
472 if (!fetchState.bDisableIndexOOBCheck) {
473 // check for out-of-bounds access, including partial OOB, and replace those offsets with minVertex
474 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
475 Value *oob = ICMP_ULE(endOffset, size);
476 if (fetchState.bPartialVertexBuffer) {
477 offset = SELECT(oob, offset, minVertexOffset);
478 } else {
479 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
480 }
481 }
482
483 Value* pointer = GEP(stream, offset);
484 // We use a full-lane, but don't actually care.
485 Value* vptr = 0;
486
487 // get a pointer to a 4 component attrib in default address space
488 switch(bpc)
489 {
490 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
491 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
492 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
493 default: SWR_INVALID("Unsupported underlying bpp!");
494 }
495
496 // load 4 components of attribute
497 Value* vec = ALIGNED_LOAD(vptr, 1, false);
498
499 // Convert To FP32 internally
500 switch(info.type[0])
501 {
502 case SWR_TYPE_UNORM:
503 switch(bpc)
504 {
505 case 8:
506 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
507 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
508 break;
509 case 16:
510 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
511 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
512 break;
513 default:
514 SWR_INVALID("Unsupported underlying type!");
515 break;
516 }
517 break;
518 case SWR_TYPE_SNORM:
519 switch(bpc)
520 {
521 case 8:
522 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
523 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
524 break;
525 case 16:
526 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
527 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
528 break;
529 default:
530 SWR_INVALID("Unsupported underlying type!");
531 break;
532 }
533 break;
534 case SWR_TYPE_UINT:
535 // Zero extend UINT types.
536 switch(bpc)
537 {
538 case 8:
539 case 16:
540 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
541 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
542 break;
543 case 32:
544 break; // Pass through unchanged.
545 default:
546 SWR_INVALID("Unsupported underlying type!");
547 break;
548 }
549 break;
550 case SWR_TYPE_SINT:
551 // Sign extend SINT types.
552 switch(bpc)
553 {
554 case 8:
555 case 16:
556 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
557 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
558 break;
559 case 32:
560 break; // Pass through unchanged.
561 default:
562 SWR_INVALID("Unsupported underlying type!");
563 break;
564 }
565 break;
566 case SWR_TYPE_FLOAT:
567 switch(bpc)
568 {
569 case 32:
570 break; // Pass through unchanged.
571 default:
572 SWR_INVALID("Unsupported underlying type!");
573 }
574 break;
575 case SWR_TYPE_USCALED:
576 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
577 break;
578 case SWR_TYPE_SSCALED:
579 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
580 break;
581 case SWR_TYPE_SFIXED:
582 vec = FMUL(SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)), VBROADCAST(C(1/65536.0f)));
583 break;
584 case SWR_TYPE_UNKNOWN:
585 case SWR_TYPE_UNUSED:
586 SWR_INVALID("Unsupported type %d!", info.type[0]);
587 }
588
589 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
590 // uwvec: 4 x F32, undef value
591 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
592 vectors.push_back(wvec);
593 }
594
595 std::vector<Constant*> v01Mask(mVWidth);
596 std::vector<Constant*> v23Mask(mVWidth);
597 std::vector<Constant*> v02Mask(mVWidth);
598 std::vector<Constant*> v13Mask(mVWidth);
599
600 // Concatenate the vectors together.
601 elements[0] = VUNDEF_F();
602 elements[1] = VUNDEF_F();
603 elements[2] = VUNDEF_F();
604 elements[3] = VUNDEF_F();
605 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
606 {
607 v01Mask[4 * b + 0] = C(0 + 4 * b);
608 v01Mask[4 * b + 1] = C(1 + 4 * b);
609 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
610 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
611
612 v23Mask[4 * b + 0] = C(2 + 4 * b);
613 v23Mask[4 * b + 1] = C(3 + 4 * b);
614 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
615 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
616
617 v02Mask[4 * b + 0] = C(0 + 4 * b);
618 v02Mask[4 * b + 1] = C(2 + 4 * b);
619 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
620 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
621
622 v13Mask[4 * b + 0] = C(1 + 4 * b);
623 v13Mask[4 * b + 1] = C(3 + 4 * b);
624 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
625 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
626
627 std::vector<Constant*> iMask(mVWidth);
628 for(uint32_t i = 0; i < mVWidth; ++i)
629 {
630 if(((4 * b) <= i) && (i < (4 * (b + 1))))
631 {
632 iMask[i] = C(i % 4 + mVWidth);
633 }
634 else
635 {
636 iMask[i] = C(i);
637 }
638 }
639 Constant* insertMask = ConstantVector::get(iMask);
640 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
641 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
642 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
643 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
644 }
645
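// AOS -> SOA transpose: first interleave the x/y (and z/w) pairs of adjacent vertices,
// then split even/odd positions so each elements[c] holds component c for all lanes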
646 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
647 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
648 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
649 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
650 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
651 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
652 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
653 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
654
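// intentional case fallthrough: components the format does not supply default to (0, 0, 0, 1)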
655 switch(numComponents + 1)
656 {
657 case 1: elements[0] = VIMMED1(0.0f);
658 case 2: elements[1] = VIMMED1(0.0f);
659 case 3: elements[2] = VIMMED1(0.0f);
660 case 4: elements[3] = VIMMED1(1.0f);
661 }
662
663 for(uint32_t c = 0; c < 4; ++c)
664 {
665 #if USE_SIMD16_SHADERS
666 Value* dest = GEP(pVtxOut, C(nelt * 8 + c * 2), "destGEP");
667 #else
668 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
669 #endif
670 STORE(elements[c], dest);
671 }
672 }
673 }
674
675 // returns true for odd formats that require special gather handling
676 bool FetchJit::IsOddFormat(SWR_FORMAT format)
677 {
678 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
679 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
680 {
681 return true;
682 }
683 return false;
684 }
685
686 // format is uniform if all components are the same size and type
687 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
688 {
689 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
690 uint32_t bpc0 = info.bpc[0];
691 uint32_t type0 = info.type[0];
692
693 for (uint32_t c = 1; c < info.numComps; ++c)
694 {
695 if (bpc0 != info.bpc[c] || type0 != info.type[c])
696 {
697 return false;
698 }
699 }
700 return true;
701 }
702
703 // unpacks components based on format
704 // foreach component in the pixel
705 // mask off everything but this component
706 // shift component to LSB
707 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
708 {
709 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
710
711 uint32_t bitOffset = 0;
712 for (uint32_t c = 0; c < info.numComps; ++c)
713 {
714 uint32_t swizzledIndex = info.swizzle[c];
715 uint32_t compBits = info.bpc[c];
716 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
717 Value* comp = AND(vInput, bitmask);
718 comp = LSHR(comp, bitOffset);
719
720 result[swizzledIndex] = comp;
721 bitOffset += compBits;
722 }
723 }
724
725 // gather for odd component size formats
726 // gather SIMD full pixels per lane, then shift/mask to move each component into its
727 // own vector
728 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
729 {
730 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
731
732 // only works if pixel size is <= 32bits
733 SWR_ASSERT(info.bpp <= 32);
734
735 Value *pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
736
737 for (uint32_t comp = 0; comp < 4; ++comp)
738 {
739 pResult[comp] = VIMMED1((int)info.defaults[comp]);
740 }
741
742 UnpackComponents(format, pGather, pResult);
743
744 // cast to fp32
745 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
746 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
747 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
748 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
749 }
750
751 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
752 {
753 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
754
755 for (uint32_t c = 0; c < info.numComps; ++c)
756 {
757 uint32_t compIndex = info.swizzle[c];
758
759 // skip any conversion on UNUSED components
760 if (info.type[c] == SWR_TYPE_UNUSED)
761 {
762 continue;
763 }
764
765 if (info.isNormalized[c])
766 {
767 if (info.type[c] == SWR_TYPE_SNORM)
768 {
769 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
770
771 /// result = c * (1.0f / (2^(n-1) - 1))
772 uint32_t n = info.bpc[c];
773 uint32_t pow2 = 1 << (n - 1);
774 float scale = 1.0f / (float)(pow2 - 1);
775 Value *vScale = VIMMED1(scale);
776 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
777 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
778 texels[compIndex] = FMUL(texels[compIndex], vScale);
779 }
780 else
781 {
782 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
783
784 /// result = c * (1.0f / (2^n - 1))
785 uint32_t n = info.bpc[c];
786 uint32_t pow2 = 1 << n;
787 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
788 if (n == 24)
789 {
790 float scale = (float)(pow2 - 1);
791 Value* vScale = VIMMED1(scale);
792 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
793 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
794 texels[compIndex] = FDIV(texels[compIndex], vScale);
795 }
796 else
797 {
798 float scale = 1.0f / (float)(pow2 - 1);
799 Value *vScale = VIMMED1(scale);
800 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
801 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
802 texels[compIndex] = FMUL(texels[compIndex], vScale);
803 }
804 }
805 continue;
806 }
807 }
808 }
809
810 //////////////////////////////////////////////////////////////////////////
811 /// @brief Loads attributes from memory using AVX2 GATHER(s)
812 /// @param fetchState - info about attributes to be fetched from memory
813 /// @param streams - value pointer to the current vertex stream
814 /// @param vIndices - vector value of indices to gather
815 /// @param pVtxOut - value pointer to output simdvertex struct
816 #if USE_SIMD16_SHADERS
817 #if USE_SIMD16_GATHERS
818 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
819 Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2)
820 #else
821 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
822 Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2)
823 #endif
824 #else
825 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
826 Value* streams, Value* vIndices, Value* pVtxOut)
827 #endif
828 {
829 uint32_t currentVertexElement = 0;
830 uint32_t outputElt = 0;
831 Value* vVertexElements[4];
832 #if USE_SIMD16_GATHERS
833 Value* vVertexElements2[4];
834 #if USE_SIMD16_BUILDER
835 Value *pVtxSrc2[4];
836 #endif
837 #endif
838
839 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
840 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
841 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
842 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
843 curInstance->setName("curInstance");
844
845 for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
846 {
847 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
848
849 // skip element if all components are disabled
850 if (ied.ComponentPacking == ComponentEnable::NONE)
851 {
852 continue;
853 }
854
855 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
856 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
857 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
858
859 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
860
861 // VGATHER* takes an *i8 src pointer
862 Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
863
864 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
865 Value *vStride = VBROADCAST(stride);
866
867 // max vertex index that is fully in bounds
868 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
869 maxVertex = LOAD(maxVertex);
870
871 Value *minVertex = NULL;
872 if (fetchState.bPartialVertexBuffer)
873 {
874 // min vertex index for low bounds OOB checking
875 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
876 minVertex = LOAD(minVertex);
877 }
878
879 if (fetchState.bInstanceIDOffsetEnable)
880 {
881 // the InstanceID (curInstance) value is offset by StartInstanceLocation
882 curInstance = ADD(curInstance, startInstance);
883 }
884
885 Value *vCurIndices;
886 #if USE_SIMD16_GATHERS
887 Value *vCurIndices2;
888 #endif
889 Value *startOffset;
890 Value *vInstanceStride = VIMMED1(0);
891
892 if (ied.InstanceEnable)
893 {
894 Value* stepRate = C(ied.InstanceAdvancementState);
895
896 // prevent a div by 0 for 0 step rate
897 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
898 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
899
900 // calc the current offset into instanced data buffer
901 Value* calcInstance = UDIV(curInstance, stepRate);
902
903 // if step rate is 0, every instance gets instance 0
904 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
905
906 vCurIndices = VBROADCAST(calcInstance);
907 #if USE_SIMD16_GATHERS
908 vCurIndices2 = VBROADCAST(calcInstance);
909 #endif
910
911 startOffset = startInstance;
912 }
913 else if (ied.InstanceStrideEnable)
914 {
915 // grab the instance advancement state, determines stride in bytes from one instance to the next
916 Value* stepRate = C(ied.InstanceAdvancementState);
917 vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
918
919 // offset indices by baseVertex
920 vCurIndices = ADD(vIndices, vBaseVertex);
921 #if USE_SIMD16_GATHERS
922 vCurIndices2 = ADD(vIndices2, vBaseVertex);
923 #endif
924
925 startOffset = startVertex;
926 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
927 }
928 else
929 {
930 // offset indices by baseVertex
931 vCurIndices = ADD(vIndices, vBaseVertex);
932 #if USE_SIMD16_GATHERS
933 vCurIndices2 = ADD(vIndices2, vBaseVertex);
934 #endif
935
936 startOffset = startVertex;
937 }
938
939 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
940 // do 64bit address offset calculations.
941
942 // calculate byte offset to the start of the VB
943 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
944 pStreamBase = GEP(pStreamBase, baseOffset);
945
946 // if we have a start offset, subtract from max vertex. Used for OOB check
947 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
948 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
949 // if we have a negative value, we're already OOB. clamp at 0.
950 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
951
952 if (fetchState.bPartialVertexBuffer)
953 {
954 // similarly for min vertex
955 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
956 Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
957 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
958 }
959
960 // Load the in bounds size of a partially valid vertex
961 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
962 partialInboundsSize = LOAD(partialInboundsSize);
963 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
964 Value* vBpp = VBROADCAST(C(info.Bpp));
965 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
966
967 // is the element <= the partially valid size
968 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
969
970 #if USE_SIMD16_GATHERS
971 // override cur indices with 0 if pitch is 0
972 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
973 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
974 vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2);
975
976 // are vertices partially OOB?
977 Value* vMaxVertex = VBROADCAST(maxVertex);
978 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
979 Value* vPartialOOBMask2 = ICMP_EQ(vCurIndices2, vMaxVertex);
980
981 // are vertices fully in bounds?
982 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
983 Value* vMaxGatherMask2 = ICMP_ULT(vCurIndices2, vMaxVertex);
984
985 Value *vGatherMask;
986 Value *vGatherMask2;
987 if (fetchState.bPartialVertexBuffer)
988 {
989 // are vertices below minVertex limit?
990 Value *vMinVertex = VBROADCAST(minVertex);
991 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
992 Value *vMinGatherMask2 = ICMP_UGE(vCurIndices2, vMinVertex);
993
994 // only fetch lanes that pass both tests
995 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
996 vGatherMask2 = AND(vMaxGatherMask2, vMinGatherMask2);
997 }
998 else
999 {
1000 vGatherMask = vMaxGatherMask;
1001 vGatherMask2 = vMaxGatherMask2;
1002 }
1003
1004 // blend in any partially OOB indices that have valid elements
1005 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
1006 vGatherMask2 = SELECT(vPartialOOBMask2, vElementInBoundsMask, vGatherMask2);
1007
1008 // calculate the actual offsets into the VB
1009 Value* vOffsets = MUL(vCurIndices, vStride);
1010 vOffsets = ADD(vOffsets, vAlignmentOffsets);
1011
1012 Value* vOffsets2 = MUL(vCurIndices2, vStride);
1013 vOffsets2 = ADD(vOffsets2, vAlignmentOffsets);
1014
1015 // if instance stride enable is:
1016 // true  - add the product of the instanceID and advancement state to the offset into the VB
1017 // false - value of vInstanceStride has been initialized to zero
1018 vOffsets = ADD(vOffsets, vInstanceStride);
1019 vOffsets2 = ADD(vOffsets2, vInstanceStride);
1020
1021 #else
1022 // override cur indices with 0 if pitch is 0
1023 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
1024 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
1025
1026 // are vertices partially OOB?
1027 Value* vMaxVertex = VBROADCAST(maxVertex);
1028 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
1029
1030 // are vertices fully in bounds?
1031 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
1032
1033 Value *vGatherMask;
1034 if (fetchState.bPartialVertexBuffer)
1035 {
1036 // are vertices below minVertex limit?
1037 Value *vMinVertex = VBROADCAST(minVertex);
1038 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
1039
1040 // only fetch lanes that pass both tests
1041 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
1042 }
1043 else
1044 {
1045 vGatherMask = vMaxGatherMask;
1046 }
1047
1048 // blend in any partially OOB indices that have valid elements
1049 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
1050
1051 // calculate the actual offsets into the VB
1052 Value* vOffsets = MUL(vCurIndices, vStride);
1053 vOffsets = ADD(vOffsets, vAlignmentOffsets);
1054
1055 // if instance stride enable is:
1056 // true  - add the product of the instanceID and advancement state to the offset into the VB
1057 // false - value of vInstanceStride has been initialized to zero
1058 vOffsets = ADD(vOffsets, vInstanceStride);
1059
1060 #endif
1061 // Packing and component control
1062 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
1063 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
1064 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
1065
1066 // Special gather/conversion for formats without equal component sizes
1067 if (IsOddFormat((SWR_FORMAT)ied.Format))
1068 {
1069 #if USE_SIMD16_GATHERS
1070 Value *pResults[4];
1071 Value *pResults2[4];
1072 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1073 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask2, pStreamBase, vOffsets2, pResults2);
1074 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1075 ConvertFormat((SWR_FORMAT)ied.Format, pResults2);
1076
1077 for (uint32_t c = 0; c < 4; c += 1)
1078 {
1079 if (isComponentEnabled(compMask, c))
1080 {
1081 #if USE_SIMD16_BUILDER
1082 // pack adjacent pairs of SIMD8s into SIMD16s
1083 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1084 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pResults[c], 0);
1085 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pResults2[c], 1);
1086
1087 #else
1088 vVertexElements[currentVertexElement] = pResults[c];
1089 vVertexElements2[currentVertexElement] = pResults2[c];
1090
1091 #endif
1092 currentVertexElement += 1;
1093
1094 if (currentVertexElement > 3)
1095 {
1096 #if USE_SIMD16_BUILDER
1097 // store SIMD16s
1098 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1099
1100 StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
1101
1102 #else
1103 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1104 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1105
1106 #endif
1107 outputElt += 1;
1108
1109 // reset to the next vVertexElement to output
1110 currentVertexElement = 0;
1111 }
1112 }
1113 }
1114 #else
1115 Value* pResults[4];
1116 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1117 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1118
1119 for (uint32_t c = 0; c < 4; ++c)
1120 {
1121 if (isComponentEnabled(compMask, c))
1122 {
1123 vVertexElements[currentVertexElement++] = pResults[c];
1124 if (currentVertexElement > 3)
1125 {
1126 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1127 // reset to the next vVertexElement to output
1128 currentVertexElement = 0;
1129 }
1130 }
1131 }
1132 #endif
1133 }
1134 else if(info.type[0] == SWR_TYPE_FLOAT)
1135 {
1136 ///@todo: support 64 bit vb accesses
1137 Value *gatherSrc = VIMMED1(0.0f);
1138 #if USE_SIMD16_GATHERS
1139 Value *gatherSrc2 = VIMMED1(0.0f);
1140 #if USE_SIMD16_BUILDER
1141 Value *gatherSrc16 = VIMMED2_1(0.0f);
1142 #endif
1143 #endif
1144
1145 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1146 "Unsupported format for standard gather fetch.");
1147
1148 // Gather components from memory to store in a simdvertex structure
1149 switch (bpc)
1150 {
1151 case 16:
1152 {
1153 #if USE_SIMD16_GATHERS
1154 Value *vGatherResult[2];
1155 Value *vGatherResult2[2];
1156
1157 // if we have at least one component out of x or y to fetch
1158 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1159 {
1160 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1161 vGatherResult2[0] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1162 // e.g. result of first 8x32bit integer gather for 16bit components
1163 // 256i - 0 1 2 3 4 5 6 7
1164 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1165 //
1166 }
1167 else
1168 {
1169 vGatherResult[0] = VUNDEF_I();
1170 vGatherResult2[0] = VUNDEF_I();
1171 }
1172
1173 // if we have at least one component out of z or w to fetch
1174 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1175 {
1176 // offset base to the next components(zw) in the vertex to gather
1177 pStreamBase = GEP(pStreamBase, C((char)4));
1178
1179 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1180 vGatherResult2[1] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1181 // e.g. result of second 8x32bit integer gather for 16bit components
1182 // 256i - 0 1 2 3 4 5 6 7
1183 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1184 //
1185 }
1186 else
1187 {
1188 vGatherResult[1] = VUNDEF_I();
1189 vGatherResult2[1] = VUNDEF_I();
1190 }
1191
1192 // if we have at least one component to shuffle into place
1193 if (compMask)
1194 {
1195 #if USE_SIMD16_BUILDER
1196 Value *gatherResult[2];
1197
1198 gatherResult[0] = VUNDEF2_I();
1199 gatherResult[1] = VUNDEF2_I();
1200
1201 gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult[0], 0);
1202 gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult2[0], 1);
1203
1204 gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult[1], 0);
1205 gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult2[1], 1);
1206
1207 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1208
1209 Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE,
1210 currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
1211
1212 // Shuffle gathered components into place in simdvertex struct
1213 Shuffle16bpcGather2(args); // outputs to vVertexElements ref
1214 #else
1215 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
1216 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1217 Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), Instruction::CastOps::FPExt, CONVERT_NONE,
1218 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
1219
1220 // Shuffle gathered components into place in simdvertex struct
1221 Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
1222 Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref
1223 #endif
1224 }
1225 #else
1226 Value* vGatherResult[2];
1227
1228 // if we have at least one component out of x or y to fetch
1229 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1230 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1231 // e.g. result of first 8x32bit integer gather for 16bit components
1232 // 256i - 0 1 2 3 4 5 6 7
1233 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1234 //
1235 }
1236
1237 // if we have at least one component out of z or w to fetch
1238 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1239 // offset base to the next components(zw) in the vertex to gather
1240 pStreamBase = GEP(pStreamBase, C((char)4));
1241
1242 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1243 // e.g. result of second 8x32bit integer gather for 16bit components
1244 // 256i - 0 1 2 3 4 5 6 7
1245 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1246 //
1247 }
1248
1249 // if we have at least one component to shuffle into place
1250 if(compMask){
1251 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
1252 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1253
1254 // Shuffle gathered components into place in simdvertex struct
1255 #if USE_SIMD16_SHADERS
1256 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1257 #else
1258 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1259 #endif
1260 }
1261 #endif
1262 }
1263 break;
1264 case 32:
1265 {
1266 for (uint32_t i = 0; i < 4; i += 1)
1267 {
1268 #if USE_SIMD16_GATHERS
1269 if (isComponentEnabled(compMask, i))
1270 {
1271 // if we need to gather the component
1272 if (compCtrl[i] == StoreSrc)
1273 {
1274 // Gather a SIMD of vertices
1275 // APIs allow a 4GB range for offsets
1276 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1277 // But, we know that elements must be aligned for FETCH. :)
1278 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
1279 Value *vShiftedOffsets = VPSRLI(vOffsets, C(1));
1280 Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1));
1281 #if USE_SIMD16_BUILDER
1282 Value *indices = VUNDEF2_I();
1283 indices = INSERT2_I(indices, vShiftedOffsets, 0);
1284 indices = INSERT2_I(indices, vShiftedOffsets2, 1);
1285
1286 Value *mask = VSHUFFLE(vGatherMask, vGatherMask2, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
1287
1288 pVtxSrc2[currentVertexElement] = GATHERPS2(gatherSrc16, pStreamBase, indices, mask, 2);
1289 #else
1290 vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
1291 vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vGatherMask2, 2);
1292
1293 #if USE_SIMD16_BUILDER
1294 // pack adjacent pairs of SIMD8s into SIMD16s
1295 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1296 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements[currentVertexElement], 0);
1297 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements2[currentVertexElement], 1);
1298
1299 #endif
1300 #endif
1301 currentVertexElement += 1;
1302 }
1303 else
1304 {
1305 #if USE_SIMD16_BUILDER
1306 pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
1307 #else
1308 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1309 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1310
1311 #if USE_SIMD16_BUILDER
1312 // pack adjacent pairs of SIMD8s into SIMD16s
1313 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1314 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements[currentVertexElement], 0);
1315 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements2[currentVertexElement], 1);
1316
1317 #endif
1318 #endif
1319 currentVertexElement += 1;
1320 }
1321
1322 if (currentVertexElement > 3)
1323 {
1324 #if USE_SIMD16_BUILDER
1325 // store SIMD16s
1326 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1327
1328 StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
1329
1330 #else
1331 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1332 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1333
1334 #endif
1335 outputElt += 1;
1336
1337 // reset to the next vVertexElement to output
1338 currentVertexElement = 0;
1339 }
1340 }
1341
1342 // offset base to the next component in the vertex to gather
1343 pStreamBase = GEP(pStreamBase, C((char)4));
1344 #else
1345 if (isComponentEnabled(compMask, i))
1346 {
1347 // if we need to gather the component
1348 if (compCtrl[i] == StoreSrc)
1349 {
1350 // Gather a SIMD of vertices
1351 // APIs allow a 4GB range for offsets
1352 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1353 // But, we know that elements must be aligned for FETCH. :)
1354 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
1355 Value* vShiftedOffsets = VPSRLI(vOffsets, C(1));
1356 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
1357 }
1358 else
1359 {
1360 #if USE_SIMD16_SHADERS
1361 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1362 #else
1363 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1364 #endif
1365 }
1366
1367 if (currentVertexElement > 3)
1368 {
1369 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1370 // reset to the next vVertexElement to output
1371 currentVertexElement = 0;
1372 }
1373 }
1374
1375 // offset base to the next component in the vertex to gather
1376 pStreamBase = GEP(pStreamBase, C((char)4));
1377 #endif
1378 }
1379 }
1380 break;
1381 case 64:
1382 {
1383 for (uint32_t i = 0; i < 4; i += 1)
1384 {
1385 #if USE_SIMD16_GATHERS
1386 if (isComponentEnabled(compMask, i))
1387 {
1388 // if we need to gather the component
1389 if (compCtrl[i] == StoreSrc)
1390 {
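// 64-bit components: gather 4 doubles for each simd half, convert to float, then recombine into a full simd of floats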
1391 Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1392 Value *vMaskLo2 = VSHUFFLE(vGatherMask2, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1393 Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1394 Value *vMaskHi2 = VSHUFFLE(vGatherMask2, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1395
1396 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1397 Value *vOffsetsLo2 = VEXTRACTI128(vOffsets2, C(0));
1398 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1399 Value *vOffsetsHi2 = VEXTRACTI128(vOffsets2, C(1));
1400
1401 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1402
1403 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
1404 Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2);
1405 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
1406 Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2);
1407
1408 pGatherLo = VCVTPD2PS(pGatherLo);
1409 pGatherLo2 = VCVTPD2PS(pGatherLo2);
1410 pGatherHi = VCVTPD2PS(pGatherHi);
1411 pGatherHi2 = VCVTPD2PS(pGatherHi2);
1412
1413 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1414 Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1415
1416 #if USE_SIMD16_BUILDER
1417 // pack adjacent pairs of SIMD8s into SIMD16s
1418 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1419 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather, 0);
1420 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather2, 1);
1421
1422 #else
1423 vVertexElements[currentVertexElement] = pGather;
1424 vVertexElements2[currentVertexElement] = pGather2;
1425
1426 #endif
1427 currentVertexElement += 1;
1428 }
1429 else
1430 {
1431 #if USE_SIMD16_BUILDER
1432 pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
1433
1434 #else
1435 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1436 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1437
1438 #endif
1439 currentVertexElement += 1;
1440 }
1441
1442 if (currentVertexElement > 3)
1443 {
1444 #if USE_SIMD16_BUILDER
1445 // store SIMD16s
1446 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1447
1448 StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
1449
1450 #else
1451 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1452 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1453
1454 #endif
1455 outputElt += 1;
1456
1457 // reset to the next vVertexElement to output
1458 currentVertexElement = 0;
1459 }
1460 }
1461
1462 // offset base to the next component in the vertex to gather
1463 pStreamBase = GEP(pStreamBase, C((char)8));
1464 #else
1465 if (isComponentEnabled(compMask, i))
1466 {
1467 // if we need to gather the component
1468 if (compCtrl[i] == StoreSrc)
1469 {
1470 Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
1471 Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
1472
1473 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1474 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1475
1476 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1477
1478 Value* pGatherLo = GATHERPD(vZeroDouble,
1479 pStreamBase, vOffsetsLo, vMaskLo);
1480 Value* pGatherHi = GATHERPD(vZeroDouble,
1481 pStreamBase, vOffsetsHi, vMaskHi);
1482
1483 pGatherLo = VCVTPD2PS(pGatherLo);
1484 pGatherHi = VCVTPD2PS(pGatherHi);
1485
1486 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
1487
1488 vVertexElements[currentVertexElement++] = pGather;
1489 }
1490 else
1491 {
1492 #if USE_SIMD16_SHADERS
1493 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1494 #else
1495 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1496 #endif
1497 }
1498
1499 if (currentVertexElement > 3)
1500 {
1501 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1502 // reset to the next vVertexElement to output
1503 currentVertexElement = 0;
1504 }
1505 }
1506
1507 // offset base to the next component in the vertex to gather
1508 pStreamBase = GEP(pStreamBase, C((char)8));
1509 #endif
1510 }
1511 }
1512 break;
1513 default:
1514 SWR_INVALID("Tried to fetch invalid FP format");
1515 break;
1516 }
1517 }
1518 else
1519 {
1520 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
1521 ConversionType conversionType = CONVERT_NONE;
1522
1523 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1524 "Unsupported format for standard gather fetch.");
1525
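// note: UNORM and SNORM intentionally fall through to UINT and SINT to pick up the ZExt/SExt cast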
1526 switch(info.type[0])
1527 {
1528 case SWR_TYPE_UNORM:
1529 conversionType = CONVERT_NORMALIZED;
1530 case SWR_TYPE_UINT:
1531 extendCastType = Instruction::CastOps::ZExt;
1532 break;
1533 case SWR_TYPE_SNORM:
1534 conversionType = CONVERT_NORMALIZED;
1535 case SWR_TYPE_SINT:
1536 extendCastType = Instruction::CastOps::SExt;
1537 break;
1538 case SWR_TYPE_USCALED:
1539 conversionType = CONVERT_USCALED;
1540 extendCastType = Instruction::CastOps::UIToFP;
1541 break;
1542 case SWR_TYPE_SSCALED:
1543 conversionType = CONVERT_SSCALED;
1544 extendCastType = Instruction::CastOps::SIToFP;
1545 break;
1546 case SWR_TYPE_SFIXED:
1547 conversionType = CONVERT_SFIXED;
1548 extendCastType = Instruction::CastOps::SExt;
1549 break;
1550 default:
1551 break;
1552 }
1553
1554 // value substituted when component of gather is masked
1555 Value* gatherSrc = VIMMED1(0);
1556 #if USE_SIMD16_GATHERS
1557 Value* gatherSrc2 = VIMMED1(0);
1558 #endif
1559
1560 // Gather components from memory to store in a simdvertex structure
1561 switch (bpc)
1562 {
1563 case 8:
1564 {
1565 // if we have at least one component to fetch
1566 if (compMask)
1567 {
1568 #if USE_SIMD16_GATHERS
1569 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1570 Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1571
1572 // e.g. result of an 8x32bit integer gather for 8bit components
1573 // 256i - 0 1 2 3 4 5 6 7
1574 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1575
1576 #if USE_SIMD16_BUILDER
1577 Value *gatherResult = VUNDEF2_I();
1578
1579 gatherResult = INSERT2_I(gatherResult, vGatherResult, 0);
1580 gatherResult = INSERT2_I(gatherResult, vGatherResult2, 1);
1581
1582 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1583
1584 Shuffle8bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
1585 currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2, info.swizzle);
1586
1587 // Shuffle gathered components into place in simdvertex struct
1588 Shuffle8bpcGatherd2(args); // outputs to vVertexElements ref
1589 #else
1590 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1591 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1592 Shuffle8bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
1593 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2, info.swizzle);
1594
1595 // Shuffle gathered components into place in simdvertex struct
1596 Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref
1597 Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements ref
1598 #endif
1599 #else
1600 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1601 // e.g. result of an 8x32bit integer gather for 8bit components
1602 // 256i - 0 1 2 3 4 5 6 7
1603 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1604
1605 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1606 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1607
1608 // Shuffle gathered components into place in simdvertex struct
1609 #if USE_SIMD16_SHADERS
1610 Shuffle8bpcGatherd(args, useVertexID2); // outputs to vVertexElements ref
1611 #else
1612 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
1613 #endif
1614 #endif
1615 }
1616 }
1617 break;
1618 case 16:
1619 {
1620 #if USE_SIMD16_GATHERS
1621 Value* vGatherResult[2];
1622 Value* vGatherResult2[2];
1623
1624 // if we have at least one component out of x or y to fetch
1625 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1626 {
1627 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1628 vGatherResult2[0] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1629 // e.g. result of first 8x32bit integer gather for 16bit components
1630 // 256i - 0 1 2 3 4 5 6 7
1631 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1632 //
1633 }
1634 else
1635 {
1636 vGatherResult[0] = VUNDEF_I();
1637 vGatherResult2[0] = VUNDEF_I();
1638 }
1639
1640 // if we have at least one component out of z or w to fetch
1641 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1642 {
1643 // offset base to the next components(zw) in the vertex to gather
1644 pStreamBase = GEP(pStreamBase, C((char)4));
1645
1646 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1647 vGatherResult2[1] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1648 // e.g. result of second 8x32bit integer gather for 16bit components
1649 // 256i - 0 1 2 3 4 5 6 7
1650 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1651 //
1652 }
1653 else
1654 {
1655 vGatherResult[1] = VUNDEF_I();
1656 vGatherResult2[1] = VUNDEF_I();
1657 }
1658
1659 // if we have at least one component to shuffle into place
1660 if (compMask)
1661 {
1662 #if USE_SIMD16_BUILDER
1663 Value *gatherResult[2];
1664
1665 gatherResult[0] = VUNDEF2_I();
1666 gatherResult[1] = VUNDEF2_I();
1667
1668 gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult[0], 0);
1669 gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult2[0], 1);
1670
1671 gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult[1], 0);
1672 gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult2[1], 1);
1673
1674 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1675
1676 Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
1677 currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
1678
1679 // Shuffle gathered components into place in simdvertex struct
1680 Shuffle16bpcGather2(args); // outputs to vVertexElements ref
1681 #else
1682 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1683 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1684 Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
1685 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
1686
1687 // Shuffle gathered components into place in simdvertex struct
1688 Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
1689 Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref
1690 #endif
1691 }
1692 #else
1693 Value* vGatherResult[2];
1694
1695 // if we have at least one component out of x or y to fetch
1696 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1697 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1698 // e.g. result of first 8x32bit integer gather for 16bit components
1699 // 256i - 0 1 2 3 4 5 6 7
1700 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1701 //
1702 }
1703
1704 // if we have at least one component out of z or w to fetch
1705 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1706 // offset base to the next components(zw) in the vertex to gather
1707 pStreamBase = GEP(pStreamBase, C((char)4));
1708
1709 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1710 // e.g. result of second 8x32bit integer gather for 16bit components
1711 // 256i - 0 1 2 3 4 5 6 7
1712 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1713 //
1714 }
1715
1716 // if we have at least one component to shuffle into place
1717 if(compMask){
1718 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1719 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1720
1721 // Shuffle gathered components into place in simdvertex struct
1722 #if USE_SIMD16_SHADERS
1723 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1724 #else
1725 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1726 #endif
1727 }
1728 #endif
1729 }
1730 break;
1731 case 32:
1732 {
1733                 // Gather components into place in the simdvertex struct
1734 for (uint32_t i = 0; i < 4; i++)
1735 {
1736 if (isComponentEnabled(compMask, i))
1737 {
1738 // if we need to gather the component
1739 if (compCtrl[i] == StoreSrc)
1740 {
1741 #if USE_SIMD16_GATHERS
1742 Value *pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1743 Value *pGather2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1744
1745 if (conversionType == CONVERT_USCALED)
1746 {
1747 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1748 pGather2 = UI_TO_FP(pGather2, mSimdFP32Ty);
1749 }
1750 else if (conversionType == CONVERT_SSCALED)
1751 {
1752 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1753 pGather2 = SI_TO_FP(pGather2, mSimdFP32Ty);
1754 }
1755 else if (conversionType == CONVERT_SFIXED)
1756 {
1757 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
1758 pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
1759 }
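// SFIXED is 16.16 signed fixed point; scaling by 1/65536 recovers the value
// (e.g. raw 0x00018000 = 98304 -> 98304 / 65536.0f = 1.5f)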
1760
1761 #if USE_SIMD16_BUILDER
1762 // pack adjacent pairs of SIMD8s into SIMD16s
1763 pVtxSrc2[currentVertexElement] = VUNDEF2_F();
1764 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather, 0);
1765 pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather2, 1);
1766
1767 #else
1768 vVertexElements[currentVertexElement] = pGather;
1769 vVertexElements2[currentVertexElement] = pGather2;
1770
1771 #endif
1772
1773 // e.g. result of a single 8x32bit integer gather for 32bit components
1774 // 256i - 0 1 2 3 4 5 6 7
1775 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1776
1777 currentVertexElement += 1;
1778 #else
1779 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1780
1781 if (conversionType == CONVERT_USCALED)
1782 {
1783 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1784 }
1785 else if (conversionType == CONVERT_SSCALED)
1786 {
1787 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1788 }
1789 else if (conversionType == CONVERT_SFIXED)
1790 {
1791 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
1792 }
1793
1794 vVertexElements[currentVertexElement++] = pGather;
1795 // e.g. result of a single 8x32bit integer gather for 32bit components
1796 // 256i - 0 1 2 3 4 5 6 7
1797 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1798 #endif
1799 }
1800 else
1801 {
1802 #if USE_SIMD16_SHADERS
1803 #if USE_SIMD16_GATHERS
1804 #if USE_SIMD16_BUILDER
1805 pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
1806
1807 #else
1808 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1809 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1810
1811 #endif
1812 currentVertexElement += 1;
1813 #else
1814 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1815 #endif
1816 #else
1817 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1818 #endif
1819 }
1820
1821 if (currentVertexElement > 3)
1822 {
1823 #if USE_SIMD16_GATHERS
1824 #if USE_SIMD16_BUILDER
1825 // store SIMD16s
1826 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1827
1828 StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
1829
1830 #else
1831 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1832 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1833
1834 #endif
1835 outputElt += 1;
1836 #else
1837 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1838 #endif
1839
1840 // reset to the next vVertexElement to output
1841 currentVertexElement = 0;
1842 }
1843
1844 }
1845
1846 // offset base to the next component in the vertex to gather
1847 pStreamBase = GEP(pStreamBase, C((char)4));
1848 }
1849 }
1850 break;
1851 }
1852 }
1853 }
1854
1855 // if we have a partially filled vVertexElement struct, output it
1856 if (currentVertexElement > 0)
1857 {
1858 #if USE_SIMD16_GATHERS
1859 #if USE_SIMD16_BUILDER
1860 // store SIMD16s
1861 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1862
1863 StoreVertexElements2(pVtxOut2, outputElt, currentVertexElement, pVtxSrc2);
1864
1865 #else
1866 StoreVertexElements(pVtxOut, outputElt, currentVertexElement, vVertexElements);
1867 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, currentVertexElement, vVertexElements2);
1868
1869 #endif
1870 outputElt += 1;
1871 #else
1872 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1873 #endif
1874 }
1875 }
1876
1877 //////////////////////////////////////////////////////////////////////////
1878 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1879 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1880 /// support
1881 /// @param pIndices - pointer to 8 bit indices
1882 /// @param pLastIndex - pointer to last valid index
1883 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1884 {
1885     // can fit 4 8 bit integers per vWidth lane
1886 Value* vIndices = VUNDEF_I();
1887
1888 // store 0 index on stack to be used to conditionally load from if index address is OOB
1889 Value* pZeroIndex = ALLOCA(mInt8Ty);
1890 STORE(C((uint8_t)0), pZeroIndex);
1891
1892 // Load a SIMD of index pointers
1893 for(int64_t lane = 0; lane < mVWidth; lane++)
1894 {
1895 // Calculate the address of the requested index
1896 Value *pIndex = GEP(pIndices, C(lane));
1897
1898         // check if the index address is within the index buffer (less than pLastIndex)
1899 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1900
1901 // if valid, load the index. if not, load 0 from the stack
1902 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1903 Value *index = LOAD(pValid, "valid index");
1904
1905         // zero extend the index to 32 bits and insert into the correct simd lane
1906 index = Z_EXT(index, mInt32Ty);
1907 vIndices = VINSERT(vIndices, index, lane);
1908 }
1909 return vIndices;
1910 }
1911
1912 //////////////////////////////////////////////////////////////////////////
1913 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1914 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1915 /// support
1916 /// @param pIndices - pointer to 16 bit indices
1917 /// @param pLastIndex - pointer to last valid index
1918 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1919 {
1920 // can fit 2 16 bit integers per vWidth lane
1921 Value* vIndices = VUNDEF_I();
1922
1923 // store 0 index on stack to be used to conditionally load from if index address is OOB
1924 Value* pZeroIndex = ALLOCA(mInt16Ty);
1925 STORE(C((uint16_t)0), pZeroIndex);
1926
1927 // Load a SIMD of index pointers
1928 for(int64_t lane = 0; lane < mVWidth; lane++)
1929 {
1930 // Calculate the address of the requested index
1931 Value *pIndex = GEP(pIndices, C(lane));
1932
1933         // check if the index address is within the index buffer (less than pLastIndex)
1934 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1935
1936 // if valid, load the index. if not, load 0 from the stack
1937 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1938 Value *index = LOAD(pValid, "valid index");
1939
1940         // zero extend the index to 32 bits and insert into the correct simd lane
1941 index = Z_EXT(index, mInt32Ty);
1942 vIndices = VINSERT(vIndices, index, lane);
1943 }
1944 return vIndices;
1945 }
1946
1947 //////////////////////////////////////////////////////////////////////////
1948 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1949 /// @param pIndices - pointer to 32 bit indices
1950 /// @param pLastIndex - pointer to last valid index
1951 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1952 {
1953 DataLayout dL(JM()->mpCurrentModule);
1954 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1955 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1956 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1957
1958 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1959 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1960 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1961 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1962
1963 // create a vector of index counts from the base index ptr passed into the fetch
1964 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1965 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1966
1967 // compare index count to the max valid index
1968     // e.g. vMaxIndex      4 4 4 4 4 4 4 4 : 4 indices left to load
1969 // vIndexOffsets 0 1 2 3 4 5 6 7
1970 // ------------------------------
1971 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1972 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1973 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1974 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1975
1976     // MASKLOADD takes an *i8 src pointer
1977 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1978
1979 // Load the indices; OOB loads 0
1980 return MASKLOADD(pIndices,vIndexMask);
1981 }
1982
1983 //////////////////////////////////////////////////////////////////////////
1984 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1985 /// denormalizes if needed, converts to F32 if needed, and positions in
1986 /// the proper SIMD rows to be output to the simdvertex structure
1987 /// @param args: (tuple of args, listed below)
1988 /// @param vGatherResult - 8 gathered 8bpc vertices
1989 /// @param pVtxOut - base pointer to output simdvertex struct
1990 /// @param extendType - sign extend or zero extend
1991 /// @param conversionType - type of conversion to apply (normalized, scaled, fixed or none)
1992 /// @param currentVertexElement - reference to the current vVertexElement
1993 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1994 /// @param compMask - component packing mask
1995 /// @param compCtrl - component control val
1996 /// @param vVertexElements[4] - vertex components to output
1997 /// @param swizzle[4] - component swizzle location
1998 #if USE_SIMD16_SHADERS
1999 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2)
2000 #else
2001 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
2002 #endif
2003 {
2004 // Unpack tuple args
2005 Value*& vGatherResult = std::get<0>(args);
2006 Value* pVtxOut = std::get<1>(args);
2007 const Instruction::CastOps extendType = std::get<2>(args);
2008 const ConversionType conversionType = std::get<3>(args);
2009 uint32_t &currentVertexElement = std::get<4>(args);
2010 uint32_t &outputElt = std::get<5>(args);
2011 const ComponentEnable compMask = std::get<6>(args);
2012 const ComponentControl (&compCtrl)[4] = std::get<7>(args);
2013 Value* (&vVertexElements)[4] = std::get<8>(args);
2014 const uint32_t (&swizzle)[4] = std::get<9>(args);
2015
2016 // cast types
2017 Type* vGatherTy = mSimdInt32Ty;
2018 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
2019
2020 // have to do extra work for sign extending
2021 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
2022         Type* v16x8Ty =  VectorType::get(mInt8Ty, mVWidth * 2); // 16x8bit ints in a 128bit lane
2023 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2024
2025 // shuffle mask, including any swizzling
2026 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
2027 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
2028 Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
2029 char(y), char(y+4), char(y+8), char(y+12),
2030 char(z), char(z+4), char(z+8), char(z+12),
2031 char(w), char(w+4), char(w+8), char(w+12),
2032 char(x), char(x+4), char(x+8), char(x+12),
2033 char(y), char(y+4), char(y+8), char(y+12),
2034 char(z), char(z+4), char(z+8), char(z+12),
2035 char(w), char(w+4), char(w+8), char(w+12)});
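// e.g. with the identity swizzle (x,y,z,w) = (0,1,2,3) this mask is
// {0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15} per 128bit lane, pulling the x bytes of the
// four gathered dwords together, then the y, z and w bytes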
2036
2037 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
2038 // after pshufb: group components together in each 128bit lane
2039 // 256i - 0 1 2 3 4 5 6 7
2040 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
2041
2042 Value* vi128XY = nullptr;
2043 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
2044 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
2045 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
2046 // 256i - 0 1 2 3 4 5 6 7
2047 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
2048 }
2049
2050 // do the same for zw components
2051 Value* vi128ZW = nullptr;
2052 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
2053 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
2054 }
2055
2056 // init denormalize variables if needed
2057 Instruction::CastOps fpCast;
2058 Value* conversionFactor;
2059
2060 switch (conversionType)
2061 {
2062 case CONVERT_NORMALIZED:
2063 fpCast = Instruction::CastOps::SIToFP;
2064 conversionFactor = VIMMED1((float)(1.0 / 127.0));
2065 break;
2066 case CONVERT_SSCALED:
2067 fpCast = Instruction::CastOps::SIToFP;
2068 conversionFactor = VIMMED1((float)(1.0));
2069 break;
2070 case CONVERT_USCALED:
2071 SWR_INVALID("Type should not be sign extended!");
2072 conversionFactor = nullptr;
2073 break;
2074 default:
2075 SWR_ASSERT(conversionType == CONVERT_NONE);
2076 conversionFactor = nullptr;
2077 break;
2078 }
2079
2080         // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2081 for (uint32_t i = 0; i < 4; i++)
2082 {
2083 if (isComponentEnabled(compMask, i))
2084 {
2085 if (compCtrl[i] == ComponentControl::StoreSrc)
2086 {
2087 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2088 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2089 // if x or y, use vi128XY permute result, else use vi128ZW
2090 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
2091
2092 // sign extend
2093 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
2094
2095 // denormalize if needed
2096 if (conversionType != CONVERT_NONE)
2097 {
2098 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2099 }
2100 currentVertexElement++;
2101 }
2102 else
2103 {
2104 #if USE_SIMD16_SHADERS
2105 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2106 #else
2107 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2108 #endif
2109 }
2110
2111 if (currentVertexElement > 3)
2112 {
2113 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2114 // reset to the next vVertexElement to output
2115 currentVertexElement = 0;
2116 }
2117 }
2118 }
2119 }
2120 // else zero extend
2121 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2122 {
2123 // init denormalize variables if needed
2124 Instruction::CastOps fpCast;
2125 Value* conversionFactor;
2126
2127 switch (conversionType)
2128 {
2129 case CONVERT_NORMALIZED:
2130 fpCast = Instruction::CastOps::UIToFP;
2131 conversionFactor = VIMMED1((float)(1.0 / 255.0));
2132 break;
2133 case CONVERT_USCALED:
2134 fpCast = Instruction::CastOps::UIToFP;
2135 conversionFactor = VIMMED1((float)(1.0));
2136 break;
2137 case CONVERT_SSCALED:
2138 SWR_INVALID("Type should not be zero extended!");
2139 conversionFactor = nullptr;
2140 break;
2141 default:
2142 SWR_ASSERT(conversionType == CONVERT_NONE);
2143 conversionFactor = nullptr;
2144 break;
2145 }
2146
2147 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
2148 for (uint32_t i = 0; i < 4; i++)
2149 {
2150 if (isComponentEnabled(compMask, i))
2151 {
2152 if (compCtrl[i] == ComponentControl::StoreSrc)
2153 {
2154 // pshufb masks for each component
2155 Value* vConstMask;
2156 switch (swizzle[i])
2157 {
2158 case 0:
2159 // x shuffle mask
2160 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
2161 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
2162 break;
2163 case 1:
2164 // y shuffle mask
2165 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
2166 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
2167 break;
2168 case 2:
2169 // z shuffle mask
2170 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
2171 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
2172 break;
2173 case 3:
2174 // w shuffle mask
2175 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
2176 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
2177 break;
2178 default:
2179 vConstMask = nullptr;
2180 break;
2181 }
2182
2183 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
2184 // after pshufb for x channel
2185 // 256i - 0 1 2 3 4 5 6 7
2186 // x000 x000 x000 x000 x000 x000 x000 x000
2187
2188 // denormalize if needed
2189 if (conversionType != CONVERT_NONE)
2190 {
2191 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2192 }
2193 currentVertexElement++;
2194 }
2195 else
2196 {
2197 #if USE_SIMD16_SHADERS
2198 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2199 #else
2200 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2201 #endif
2202 }
2203
2204 if (currentVertexElement > 3)
2205 {
2206 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2207 // reset to the next vVertexElement to output
2208 currentVertexElement = 0;
2209 }
2210 }
2211 }
2212 }
2213 else
2214 {
2215 SWR_INVALID("Unsupported conversion type");
2216 }
2217 }
2218
2219 #if USE_SIMD16_BUILDER
2220 void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args)
2221 {
2222 // Unpack tuple args
2223 Value*& vGatherResult = std::get<0>(args);
2224 Value* pVtxOut = std::get<1>(args);
2225 const Instruction::CastOps extendType = std::get<2>(args);
2226 const ConversionType conversionType = std::get<3>(args);
2227 uint32_t &currentVertexElement = std::get<4>(args);
2228 uint32_t &outputElt = std::get<5>(args);
2229 const ComponentEnable compMask = std::get<6>(args);
2230 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2231 Value* (&vVertexElements)[4] = std::get<8>(args);
2232 const uint32_t(&swizzle)[4] = std::get<9>(args);
2233
2234 // cast types
2235 Type *vGatherTy = mSimdInt32Ty;
2236 Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2237
2238 // have to do extra work for sign extending
2239 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
2240 {
2241         Type *v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 16x8bit ints in a 128bit lane
2242 Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2243
2244 // shuffle mask, including any swizzling
2245 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
2246 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
2247 Value *vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12),
2248 char(y), char(y + 4), char(y + 8), char(y + 12),
2249 char(z), char(z + 4), char(z + 8), char(z + 12),
2250 char(w), char(w + 4), char(w + 8), char(w + 12),
2251 char(x), char(x + 4), char(x + 8), char(x + 12),
2252 char(y), char(y + 4), char(y + 8), char(y + 12),
2253 char(z), char(z + 4), char(z + 8), char(z + 12),
2254 char(w), char(w + 4), char(w + 8), char(w + 12) });
2255
2256         // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
2257
2258 Value *vGatherResult_lo = EXTRACT2_I(vGatherResult, 0);
2259 Value *vGatherResult_hi = EXTRACT2_I(vGatherResult, 1);
2260
2261 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2262 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2263
2264 // after pshufb: group components together in each 128bit lane
2265 // 256i - 0 1 2 3 4 5 6 7
2266 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
2267
2268 Value *vi128XY_lo = nullptr;
2269 Value *vi128XY_hi = nullptr;
2270 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
2271 {
2272 vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
2273 vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
2274
2275 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
2276 // 256i - 0 1 2 3 4 5 6 7
2277 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
2278 }
2279
2280 // do the same for zw components
2281 Value *vi128ZW_lo = nullptr;
2282 Value *vi128ZW_hi = nullptr;
2283 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
2284 {
2285 vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
2286 vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
2287 }
2288
2289 // init denormalize variables if needed
2290 Instruction::CastOps fpCast;
2291 Value *conversionFactor;
2292
2293 switch (conversionType)
2294 {
2295 case CONVERT_NORMALIZED:
2296 fpCast = Instruction::CastOps::SIToFP;
2297 conversionFactor = VIMMED1((float)(1.0 / 127.0));
2298 break;
2299 case CONVERT_SSCALED:
2300 fpCast = Instruction::CastOps::SIToFP;
2301 conversionFactor = VIMMED1((float)(1.0));
2302 break;
2303 case CONVERT_USCALED:
2304 SWR_INVALID("Type should not be sign extended!");
2305 conversionFactor = nullptr;
2306 break;
2307 default:
2308 SWR_ASSERT(conversionType == CONVERT_NONE);
2309 conversionFactor = nullptr;
2310 break;
2311 }
2312
2313         // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2314 for (uint32_t i = 0; i < 4; i++)
2315 {
2316 if (isComponentEnabled(compMask, i))
2317 {
2318 if (compCtrl[i] == ComponentControl::StoreSrc)
2319 {
2320 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2321 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2322 // if x or y, use vi128XY permute result, else use vi128ZW
2323 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
2324 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
2325
2326 // sign extend
2327 Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
2328 Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
2329
2330 // denormalize if needed
2331 if (conversionType != CONVERT_NONE)
2332 {
2333 temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2334 temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2335 }
2336
2337 vVertexElements[currentVertexElement] = VUNDEF2_F();
2338 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
2339 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
2340
2341 currentVertexElement += 1;
2342 }
2343 else
2344 {
2345 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
2346 }
2347
2348 if (currentVertexElement > 3)
2349 {
2350 StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
2351 // reset to the next vVertexElement to output
2352 currentVertexElement = 0;
2353 }
2354 }
2355 }
2356 }
2357 // else zero extend
2358 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2359 {
2360 // init denormalize variables if needed
2361 Instruction::CastOps fpCast;
2362 Value *conversionFactor;
2363
2364 switch (conversionType)
2365 {
2366 case CONVERT_NORMALIZED:
2367 fpCast = Instruction::CastOps::UIToFP;
2368 conversionFactor = VIMMED1((float)(1.0 / 255.0));
2369 break;
2370 case CONVERT_USCALED:
2371 fpCast = Instruction::CastOps::UIToFP;
2372 conversionFactor = VIMMED1((float)(1.0));
2373 break;
2374 case CONVERT_SSCALED:
2375 SWR_INVALID("Type should not be zero extended!");
2376 conversionFactor = nullptr;
2377 break;
2378 default:
2379 SWR_ASSERT(conversionType == CONVERT_NONE);
2380 conversionFactor = nullptr;
2381 break;
2382 }
2383
2384 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
2385 for (uint32_t i = 0; i < 4; i++)
2386 {
2387 if (isComponentEnabled(compMask, i))
2388 {
2389 if (compCtrl[i] == ComponentControl::StoreSrc)
2390 {
2391 // pshufb masks for each component
2392 Value *vConstMask;
2393 switch (swizzle[i])
2394 {
2395 case 0:
2396 // x shuffle mask
2397 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
2398 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
2399 break;
2400 case 1:
2401 // y shuffle mask
2402 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
2403 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
2404 break;
2405 case 2:
2406 // z shuffle mask
2407 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
2408 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
2409 break;
2410 case 3:
2411 // w shuffle mask
2412 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
2413 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
2414 break;
2415 default:
2416 vConstMask = nullptr;
2417 break;
2418 }
2419
2420 Value *vGatherResult_lo = EXTRACT2_I(vGatherResult, 0);
2421 Value *vGatherResult_hi = EXTRACT2_I(vGatherResult, 1);
2422
2423 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2424 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2425
2426 // after pshufb for x channel
2427 // 256i - 0 1 2 3 4 5 6 7
2428 // x000 x000 x000 x000 x000 x000 x000 x000
2429
2430 // denormalize if needed
2431 if (conversionType != CONVERT_NONE)
2432 {
2433 temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2434 temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2435 }
2436
2437 vVertexElements[currentVertexElement] = VUNDEF2_F();
2438 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
2439 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
2440
2441 currentVertexElement += 1;
2442 }
2443 else
2444 {
2445 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
2446 }
2447
2448 if (currentVertexElement > 3)
2449 {
2450 StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
2451 // reset to the next vVertexElement to output
2452 currentVertexElement = 0;
2453 }
2454 }
2455 }
2456 }
2457 else
2458 {
2459 SWR_INVALID("Unsupported conversion type");
2460 }
2461 }
2462
2463 #endif
2464 //////////////////////////////////////////////////////////////////////////
2465 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
2466 /// denormalizes if needed, converts to F32 if needed, and positions in
2467 /// the proper SIMD rows to be output to the simdvertex structure
2468 /// @param args: (tuple of args, listed below)
2469 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
2470 /// @param pVtxOut - base pointer to output simdvertex struct
2471 /// @param extendType - sign extend or zero extend
2472 /// @param conversionType - type of conversion to apply (normalized, scaled, fixed or none)
2473 /// @param currentVertexElement - reference to the current vVertexElement
2474 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
2475 /// @param compMask - component packing mask
2476 /// @param compCtrl - component control val
2477 /// @param vVertexElements[4] - vertex components to output
2478 #if USE_SIMD16_SHADERS
2479 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2)
2480 #else
2481 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
2482 #endif
2483 {
2484 // Unpack tuple args
2485 Value* (&vGatherResult)[2] = std::get<0>(args);
2486 Value* pVtxOut = std::get<1>(args);
2487 const Instruction::CastOps extendType = std::get<2>(args);
2488 const ConversionType conversionType = std::get<3>(args);
2489 uint32_t &currentVertexElement = std::get<4>(args);
2490 uint32_t &outputElt = std::get<5>(args);
2491 const ComponentEnable compMask = std::get<6>(args);
2492 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2493 Value* (&vVertexElements)[4] = std::get<8>(args);
2494
2495 // cast types
2496 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
2497 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2498
2499 // have to do extra work for sign extending
2500 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
2501 (extendType == Instruction::CastOps::FPExt))
2502 {
2503         // is this a half-precision (16 bit) float?
2504 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
2505
2506 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
2507 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2508
2509 // shuffle mask
2510 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2511 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
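// per 128bit lane: bytes {0,1, 4,5, 8,9, 12,13} collect the low words (x or z) of the four
// gathered dwords, bytes {2,3, 6,7, 10,11, 14,15} collect the high words (y or w)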
2512 Value* vi128XY = nullptr;
2513 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
2514 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
2515 // after pshufb: group components together in each 128bit lane
2516 // 256i - 0 1 2 3 4 5 6 7
2517 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
2518
2519 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
2520 // after PERMD: move and pack xy components into each 128bit lane
2521 // 256i - 0 1 2 3 4 5 6 7
2522 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
2523 }
2524
2525 // do the same for zw components
2526 Value* vi128ZW = nullptr;
2527 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
2528 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
2529 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
2530 }
2531
2532 // init denormalize variables if needed
2533 Instruction::CastOps IntToFpCast;
2534 Value* conversionFactor;
2535
2536 switch (conversionType)
2537 {
2538 case CONVERT_NORMALIZED:
2539 IntToFpCast = Instruction::CastOps::SIToFP;
2540 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2541 break;
2542 case CONVERT_SSCALED:
2543 IntToFpCast = Instruction::CastOps::SIToFP;
2544 conversionFactor = VIMMED1((float)(1.0));
2545 break;
2546 case CONVERT_USCALED:
2547 SWR_INVALID("Type should not be sign extended!");
2548 conversionFactor = nullptr;
2549 break;
2550 default:
2551 SWR_ASSERT(conversionType == CONVERT_NONE);
2552 conversionFactor = nullptr;
2553 break;
2554 }
2555
2556         // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2557 for (uint32_t i = 0; i < 4; i++)
2558 {
2559 if (isComponentEnabled(compMask, i))
2560 {
2561 if (compCtrl[i] == ComponentControl::StoreSrc)
2562 {
2563 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2564 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2565 // if x or y, use vi128XY permute result, else use vi128ZW
2566 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
2567
2568 if (bFP) {
2569                         // extract 128 bit lanes and convert half-precision to single-precision float
2570 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2571 }
2572 else {
2573 // extract 128 bit lanes to sign extend each component
2574 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2575
2576 // denormalize if needed
2577 if (conversionType != CONVERT_NONE) {
2578 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2579 }
2580 }
2581 currentVertexElement++;
2582 }
2583 else
2584 {
2585 #if USE_SIMD16_SHADERS
2586 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2587 #else
2588 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2589 #endif
2590 }
2591
2592 if (currentVertexElement > 3)
2593 {
2594 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2595 // reset to the next vVertexElement to output
2596 currentVertexElement = 0;
2597 }
2598 }
2599 }
2600 }
2601 // else zero extend
2602 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2603 {
2604 // pshufb masks for each component
2605 Value* vConstMask[2];
2606 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
2607 // x/z shuffle mask
2608 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2609 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
2610 }
2611
2612 if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
2613 // y/w shuffle mask
2614 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2615 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
2616 }
2617
2618 // init denormalize variables if needed
2619 Instruction::CastOps fpCast;
2620 Value* conversionFactor;
2621
2622 switch (conversionType)
2623 {
2624 case CONVERT_NORMALIZED:
2625 fpCast = Instruction::CastOps::UIToFP;
2626 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2627 break;
2628 case CONVERT_USCALED:
2629 fpCast = Instruction::CastOps::UIToFP;
2630 conversionFactor = VIMMED1((float)(1.0f));
2631 break;
2632 case CONVERT_SSCALED:
2633 SWR_INVALID("Type should not be zero extended!");
2634 conversionFactor = nullptr;
2635 break;
2636 default:
2637 SWR_ASSERT(conversionType == CONVERT_NONE);
2638 conversionFactor = nullptr;
2639 break;
2640 }
2641
2642 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2643 for (uint32_t i = 0; i < 4; i++)
2644 {
2645 if (isComponentEnabled(compMask, i))
2646 {
2647 if (compCtrl[i] == ComponentControl::StoreSrc)
2648 {
2649 // select correct constMask for x/z or y/w pshufb
2650 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2651 // if x or y, use vi128XY permute result, else use vi128ZW
2652 uint32_t selectedGather = (i < 2) ? 0 : 1;
2653
2654 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2655 // after pshufb mask for x channel; z uses the same shuffle from the second gather
2656 // 256i - 0 1 2 3 4 5 6 7
2657 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2658
2659 // denormalize if needed
2660 if (conversionType != CONVERT_NONE)
2661 {
2662 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2663 }
2664 currentVertexElement++;
2665 }
2666 else
2667 {
2668 #if USE_SIMD16_SHADERS
2669 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2670 #else
2671 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2672 #endif
2673 }
2674
2675 if (currentVertexElement > 3)
2676 {
2677 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2678 // reset to the next vVertexElement to output
2679 currentVertexElement = 0;
2680 }
2681 }
2682 }
2683 }
2684 else
2685 {
2686 SWR_INVALID("Unsupported conversion type");
2687 }
2688 }
2689
2690 #if USE_SIMD16_BUILDER
2691 void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args)
2692 {
2693 // Unpack tuple args
2694 Value* (&vGatherResult)[2] = std::get<0>(args);
2695 Value* pVtxOut = std::get<1>(args);
2696 const Instruction::CastOps extendType = std::get<2>(args);
2697 const ConversionType conversionType = std::get<3>(args);
2698 uint32_t &currentVertexElement = std::get<4>(args);
2699 uint32_t &outputElt = std::get<5>(args);
2700 const ComponentEnable compMask = std::get<6>(args);
2701 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2702 Value* (&vVertexElements)[4] = std::get<8>(args);
2703
2704 // cast types
2705 Type *vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
2706 Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2707
2708 // have to do extra work for sign extending
2709 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
2710 {
2711         // is this a half-precision (16 bit) float?
2712 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
2713
2714 Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
2715 Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2716
2717 // shuffle mask
2718 Value *vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2719 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
2720 Value *vi128XY = nullptr;
2721 Value *vi128XY_lo = nullptr;
2722 Value *vi128XY_hi = nullptr;
2723 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
2724 {
2725             // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
2726
2727 Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[0], 0);
2728 Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[0], 1);
2729
2730 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2731 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2732
2733 // after pshufb: group components together in each 128bit lane
2734 // 256i - 0 1 2 3 4 5 6 7
2735 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
2736
2737 vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2738 vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2739
2740 // after PERMD: move and pack xy components into each 128bit lane
2741 // 256i - 0 1 2 3 4 5 6 7
2742 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
2743 #if 0
2744
2745 vi128XY = VUNDEF2_I();
2746 vi128XY = INSERT2_I(vi128XY, vi128XY_lo, 0);
2747 vi128XY = INSERT2_I(vi128XY, vi128XY_hi, 1);
2748 #endif
2749 }
2750
2751 // do the same for zw components
2752 Value *vi128ZW = nullptr;
2753 Value *vi128ZW_lo = nullptr;
2754 Value *vi128ZW_hi = nullptr;
2755 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
2756 {
2757 Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[1], 0);
2758 Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[1], 1);
2759
2760 Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
2761 Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
2762
2763 vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2764 vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
2765 #if 0
2766
2767 vi128ZW = VUNDEF2_I();
2768 vi128ZW = INSERT2_I(vi128ZW, vi128ZW_lo, 0);
2769 vi128ZW = INSERT2_I(vi128ZW, vi128ZW_hi, 1);
2770 #endif
2771 }
2772
2773 // init denormalize variables if needed
2774 Instruction::CastOps IntToFpCast;
2775 Value *conversionFactor;
2776
2777 switch (conversionType)
2778 {
2779 case CONVERT_NORMALIZED:
2780 IntToFpCast = Instruction::CastOps::SIToFP;
2781 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2782 break;
2783 case CONVERT_SSCALED:
2784 IntToFpCast = Instruction::CastOps::SIToFP;
2785 conversionFactor = VIMMED1((float)(1.0));
2786 break;
2787 case CONVERT_USCALED:
2788 SWR_INVALID("Type should not be sign extended!");
2789 conversionFactor = nullptr;
2790 break;
2791 default:
2792 SWR_ASSERT(conversionType == CONVERT_NONE);
2793 conversionFactor = nullptr;
2794 break;
2795 }
2796
2797         // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2798 for (uint32_t i = 0; i < 4; i++)
2799 {
2800 if (isComponentEnabled(compMask, i))
2801 {
2802 if (compCtrl[i] == ComponentControl::StoreSrc)
2803 {
2804 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2805 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2806 // if x or y, use vi128XY permute result, else use vi128ZW
2807 Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
2808 Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
2809
2810 if (bFP)
2811 {
2812                         // extract 128 bit lanes and convert half-precision to single-precision float
2813 Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
2814 Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
2815
2816 vVertexElements[currentVertexElement] = VUNDEF2_F();
2817 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
2818 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
2819 }
2820 else
2821 {
2822 // extract 128 bit lanes to sign extend each component
2823 Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
2824 Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
2825
2826 // denormalize if needed
2827 if (conversionType != CONVERT_NONE)
2828 {
2829 temp_lo = FMUL(CAST(IntToFpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2830 temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2831 }
2832
2833 vVertexElements[currentVertexElement] = VUNDEF2_F();
2834 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
2835 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
2836 }
2837
2838 currentVertexElement += 1;
2839 }
2840 else
2841 {
2842 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
2843 }
2844
2845 if (currentVertexElement > 3)
2846 {
2847 StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
2848 // reset to the next vVertexElement to output
2849 currentVertexElement = 0;
2850 }
2851 }
2852 }
2853 }
2854 // else zero extend
2855 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2856 {
2857 // pshufb masks for each component
2858 Value *vConstMask[2];
2859
2860 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
2861 {
2862 // x/z shuffle mask
2863 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2864 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
2865 }
2866
2867 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
2868 {
2869 // y/w shuffle mask
2870 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2871 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
2872 }
2873
2874 // init denormalize variables if needed
2875 Instruction::CastOps fpCast;
2876 Value* conversionFactor;
2877
2878 switch (conversionType)
2879 {
2880 case CONVERT_NORMALIZED:
2881 fpCast = Instruction::CastOps::UIToFP;
2882 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2883 break;
2884 case CONVERT_USCALED:
2885 fpCast = Instruction::CastOps::UIToFP;
2886 conversionFactor = VIMMED1((float)(1.0f));
2887 break;
2888 case CONVERT_SSCALED:
2889 SWR_INVALID("Type should not be zero extended!");
2890 conversionFactor = nullptr;
2891 break;
2892 default:
2893 SWR_ASSERT(conversionType == CONVERT_NONE);
2894 conversionFactor = nullptr;
2895 break;
2896 }
2897
2898 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2899 for (uint32_t i = 0; i < 4; i++)
2900 {
2901 if (isComponentEnabled(compMask, i))
2902 {
2903 if (compCtrl[i] == ComponentControl::StoreSrc)
2904 {
2905 // select correct constMask for x/z or y/w pshufb
2906 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2907 // if x or y, use vi128XY permute result, else use vi128ZW
2908 uint32_t selectedGather = (i < 2) ? 0 : 1;
2909
2910                     // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
2911
2912 Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[selectedGather], 0);
2913 Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[selectedGather], 1);
2914
2915 Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2916 Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2917
2918 // after pshufb mask for x channel; z uses the same shuffle from the second gather
2919 // 256i - 0 1 2 3 4 5 6 7
2920 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2921
2922 // denormalize if needed
2923 if (conversionType != CONVERT_NONE)
2924 {
2925 temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
2926 temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
2927 }
2928
2929 vVertexElements[currentVertexElement] = VUNDEF2_F();
2930 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
2931 vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
2932
2933 currentVertexElement += 1;
2934 }
2935 else
2936 {
2937 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
2938 }
2939
2940 if (currentVertexElement > 3)
2941 {
2942 StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
2943 // reset to the next vVertexElement to output
2944 currentVertexElement = 0;
2945 }
2946 }
2947 }
2948 }
2949 else
2950 {
2951 SWR_INVALID("Unsupported conversion type");
2952 }
2953 }
2954
2955 #endif
2956 //////////////////////////////////////////////////////////////////////////
2957 /// @brief Output a simdvertex worth of elements to the current outputElt
2958 /// @param pVtxOut - base address of VIN output struct
2959 /// @param outputElt - simdvertex offset in VIN to write to
2960 /// @param numEltsToStore - number of simdvertex rows to write out
2961 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
2962 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2963 {
2964 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2965
2966 for(uint32_t c = 0; c < numEltsToStore; ++c)
2967 {
2968 // STORE expects FP32 x vWidth type, just bitcast if needed
2969 if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2970 {
2971 #if FETCH_DUMP_VERTEX
2972 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
2973 #endif
2974 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
2975 }
2976 #if FETCH_DUMP_VERTEX
2977 else
2978 {
2979 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
2980 }
2981 #endif
2982 // outputElt * 4 = offsetting by the size of a simdvertex
2983 // + c offsets to a 32bit x vWidth row within the current vertex
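// with USE_SIMD16_SHADERS each simd16vertex spans 8 simd8 rows and each component
// occupies 2 consecutive rows, hence the outputElt * 8 + c * 2 scaling below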
2984 #if USE_SIMD16_SHADERS
2985 Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP");
2986 #else
2987 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
2988 #endif
2989 STORE(vVertexElements[c], dest);
2990 }
2991 }
2992
2993 #if USE_SIMD16_BUILDER
2994 void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2995 {
2996 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2997
2998 for (uint32_t c = 0; c < numEltsToStore; ++c)
2999 {
3000 // STORE expects FP32 x vWidth type, just bitcast if needed
3001 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
3002 {
3003 #if FETCH_DUMP_VERTEX
3004 PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
3005 #endif
3006 vVertexElements[c] = BITCAST(vVertexElements[c], mSimd2FP32Ty);
3007 }
3008 #if FETCH_DUMP_VERTEX
3009 else
3010 {
3011 PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
3012 }
3013 #endif
3014 // outputElt * 4 = offsetting by the size of a simdvertex
3015 // + c offsets to a 32bit x vWidth row within the current vertex
3016 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
3017 STORE(vVertexElements[c], dest);
3018 }
3019 }
3020
3021 #endif
3022 //////////////////////////////////////////////////////////////////////////
3023 /// @brief Generates a constant vector of values based on the
3024 /// ComponentControl value
3025 /// @param ctrl - ComponentControl value
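/// e.g. Store0 yields a vector of zeros and Store1Fp a vector of 1.0f, typically used as
/// defaults for components the fetched format does not supply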
3026 #if USE_SIMD16_SHADERS
3027 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2)
3028 #else
3029 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
3030 #endif
3031 {
3032 switch(ctrl)
3033 {
3034 case NoStore: return VUNDEF_I();
3035 case Store0: return VIMMED1(0);
3036 case Store1Fp: return VIMMED1(1.0f);
3037 case Store1Int: return VIMMED1(1);
3038 case StoreVertexId:
3039 {
3040 #if USE_SIMD16_SHADERS
3041 Value* pId;
3042 if (useVertexID2)
3043 {
3044 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
3045 }
3046 else
3047 {
3048 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
3049 }
3050 #else
3051 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
3052 #endif
3053 return VBROADCAST(pId);
3054 }
3055 case StoreInstanceId:
3056 {
3057 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
3058 return VBROADCAST(pId);
3059 }
3060 case StoreSrc:
3061 default: SWR_INVALID("Invalid component control"); return VUNDEF_I();
3062 }
3063 }
3064
3065 #if USE_SIMD16_BUILDER
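//////////////////////////////////////////////////////////////////////////
/// @brief SIMD16-builder variant of GenerateCompCtrlVector: produces a
///        simd16-wide default vector (constant, VertexID, or InstanceID) for
///        components not stored from the source buffer.
/// @param ctrl - ComponentControl value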
Value* FetchJit::GenerateCompCtrlVector2(const ComponentControl ctrl)
{
    switch (ctrl)
    {
        case NoStore: return VUNDEF2_I();
        case Store0: return VIMMED2_1(0);
        case Store1Fp: return VIMMED2_1(1.0f);
        case Store1Int: return VIMMED2_1(1);
        case StoreVertexId:
        {
            Value* pId = VUNDEF2_F();

            Value* pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
            Value* pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);

            pId = INSERT2_F(pId, pId_lo, 0);
            pId = INSERT2_F(pId, pId_hi, 1);

            return VBROADCAST2(pId);
        }
        case StoreInstanceId:
        {
            Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
            return VBROADCAST2(pId);
        }
        case StoreSrc:
        default: SWR_INVALID("Invalid component control"); return VUNDEF2_I();
    }
}

#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Returns true if the specified component is enabled in the mask.
/// @param enableMask - enable bits
/// @param component - component to check if enabled.
bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
{
    switch (component)
    {
    // X
    case 0: return (enableMask & ComponentEnable::X);
    // Y
    case 1: return (enableMask & ComponentEnable::Y);
    // Z
    case 2: return (enableMask & ComponentEnable::Z);
    // W
    case 3: return (enableMask & ComponentEnable::W);

    default: return false;
    }
}
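
// Example (illustrative only): with an enable mask of ComponentEnable::X,
// only component 0 reports enabled:
//     isComponentEnabled(ComponentEnable::X, 0) -> true
//     isComponentEnabled(ComponentEnable::X, 3) -> false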

//////////////////////////////////////////////////////////////////////////
/// @brief JITs from fetch shader IR
/// @param hJitMgr - JitManager handle
/// @param hFunc - handle to the LLVM function IR
/// @return PFN_FETCH_FUNC - pointer to fetch code
PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
{
    const llvm::Function* func = (const llvm::Function*)hFunc;
    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
    PFN_FETCH_FUNC pfnFetch;

    pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
    // MCJIT finalizes modules the first time you JIT code from them. Once finalized, you cannot add new IR to the module.
    pJitMgr->mIsModuleFinalized = true;

#if defined(KNOB_SWRC_TRACING)
    char fName[1024];
    const char *funcName = func->getName().data();
    sprintf(fName, "%s.bin", funcName);
    FILE *fd = fopen(fName, "wb");
    if (fd)
    {
        // dump a fixed-size snapshot of the jitted code for tracing
        fwrite((void *)pfnFetch, 1, 2048, fd);
        fclose(fd);
    }
#endif

    pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");

    return pfnFetch;
}

//////////////////////////////////////////////////////////////////////////
/// @brief JIT compiles fetch shader
/// @param hJitMgr - JitManager handle
/// @param state - fetch state to build function from
extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
{
    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);

    pJitMgr->SetupNewModule();

    FetchJit theJit(pJitMgr);
    HANDLE hFunc = theJit.Create(state);

    return JitFetchFunc(hJitMgr, hFunc);
}
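
// Illustrative usage sketch (not part of the driver code): a front end would
// typically build a FETCH_COMPILE_STATE from the current vertex layout, compile
// it once, and cache the resulting function pointer keyed on that state, e.g.:
//
//     FETCH_COMPILE_STATE fetchState = {};
//     PopulateFetchState(fetchState, vertexLayout);  // hypothetical helper
//     PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, fetchState);
//     // pfnFetch is later invoked by the SWR core with a populated
//     // SWR_FETCH_CONTEXT and a destination vertex buffer; see the
//     // PFN_FETCH_FUNC typedef in fetch_jit.h for the exact signature.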