dcfe8970f5c17068a4bfcb473876f8c7e5dca673
[mesa.git] / src / gallium / drivers / swr / rasterizer / jitter / fetch_jit.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "jit_api.h"
32 #include "fetch_jit.h"
33 #include "gen_state_llvm.h"
34 #include <sstream>
35 #include <tuple>
36
37 //#define FETCH_DUMP_VERTEX 1
38 using namespace llvm;
39 using namespace SwrJit;
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45 CONVERT_NONE,
46 CONVERT_NORMALIZED,
47 CONVERT_USCALED,
48 CONVERT_SSCALED,
49 CONVERT_SFIXED,
50 };
51
52 //////////////////////////////////////////////////////////////////////////
53 /// Interface to Jitting a fetch shader
54 //////////////////////////////////////////////////////////////////////////
55 struct FetchJit : public Builder
56 {
57 FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
58
59 Function* Create(const FETCH_COMPILE_STATE& fetchState);
60 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
61 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
62 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
63
64 // package up Shuffle*bpcGatherd args into a tuple for convenience
65 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
66 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
67 const uint32_t(&)[4]> Shuffle8bpcArgs;
68 #if USE_SIMD16_SHADERS
69 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2);
70 #else
71 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
72 #endif
73
74 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
75 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
76 #if USE_SIMD16_SHADERS
77 void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2);
78 #else
79 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
80 #endif
81
82 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
83
84 #if USE_SIMD16_SHADERS
85 Value* GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2);
86 #else
87 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
88 #endif
89
90 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
91 #if USE_SIMD16_SHADERS
92 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2);
93 #else
94 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
95 #endif
96
97 bool IsOddFormat(SWR_FORMAT format);
98 bool IsUniformFormat(SWR_FORMAT format);
99 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
100 void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
101 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
102
103 Value* mpFetchInfo;
104 };
105
106 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
107 {
108 std::stringstream fnName("FetchShader_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
109 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
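// the generated function is named FetchShader_<CRC of the compile state>, so each distinct fetch state gets a distinctly named shader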
110
111 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
112 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
113
114 fetch->getParent()->setModuleIdentifier(fetch->getName());
115
116 IRB()->SetInsertPoint(entry);
117
118 auto argitr = fetch->arg_begin();
119
120 // Fetch shader arguments
121 mpFetchInfo = &*argitr; ++argitr;
122 mpFetchInfo->setName("fetchInfo");
123 Value* pVtxOut = &*argitr;
124 pVtxOut->setName("vtxOutput");
125 // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex
126 // index 0 (just the pointer to the simdvertex structure)
127 // index 1 (which element of the simdvertex structure to offset to, in this case 0)
128 // so the indices being i32's doesn't matter
129 // TODO: generate this GEP with a VECTOR structure type so this makes sense
130 std::vector<Value*> vtxInputIndices(2, C(0));
131 // GEP
132 pVtxOut = GEP(pVtxOut, C(0));
133 #if USE_SIMD16_SHADERS
134 #if 0
135 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth * 2), 0));
136 #else
137 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
138 #endif
139 #else
140 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
141 #endif
142
143 // SWR_FETCH_CONTEXT::pStreams
144 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
145 streams->setName("pStreams");
146
147 // SWR_FETCH_CONTEXT::pIndices
148 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
149 indices->setName("pIndices");
150
151 // SWR_FETCH_CONTEXT::pLastIndex
152 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
153 pLastIndex->setName("pLastIndex");
154
155
156 Value* vIndices;
157 #if USE_SIMD16_SHADERS
158 Value* indices2;
159 Value* vIndices2;
160 #endif
161 switch(fetchState.indexType)
162 {
163 case R8_UINT:
164 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
165 #if USE_SIMD16_SHADERS
166 indices2 = GEP(indices, C(8));
167 #endif
168 if(fetchState.bDisableIndexOOBCheck)
169 {
170 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
171 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
172 #if USE_SIMD16_SHADERS
173 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
174 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
175 #endif
176 }
177 else
178 {
179 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
180 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
181 #if USE_SIMD16_SHADERS
182 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
183 vIndices2 = GetSimdValid8bitIndices(indices2, pLastIndex);
184 #endif
185 }
186 break;
187 case R16_UINT:
188 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
189 #if USE_SIMD16_SHADERS
190 indices2 = GEP(indices, C(8));
191 #endif
192 if(fetchState.bDisableIndexOOBCheck)
193 {
194 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
195 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
196 #if USE_SIMD16_SHADERS
197 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
198 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
199 #endif
200 }
201 else
202 {
203 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
204 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
205 #if USE_SIMD16_SHADERS
206 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
207 vIndices2 = GetSimdValid16bitIndices(indices2, pLastIndex);
208 #endif
209 }
210 break;
211 case R32_UINT:
212 #if USE_SIMD16_SHADERS
213 indices2 = GEP(indices, C(8));
214 #endif
215 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
216 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
217 #if USE_SIMD16_SHADERS
218 (fetchState.bDisableIndexOOBCheck) ? vIndices2 = LOAD(BITCAST(indices2, PointerType::get(mSimdInt32Ty, 0)), { (uint32_t)0 })
219 : vIndices2 = GetSimdValid32bitIndices(indices2, pLastIndex);
220 #endif
221 break; // incoming type is already 32bit int
222 default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break;
223 }
224
225 Value* vVertexId = vIndices;
226 #if USE_SIMD16_SHADERS
227 Value* vVertexId2 = vIndices2;
228 #endif
229 if (fetchState.bVertexIDOffsetEnable)
230 {
231 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
232 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
233 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
234 vVertexId = ADD(vIndices, vBaseVertex);
235 vVertexId = ADD(vVertexId, vStartVertex);
236 #if USE_SIMD16_SHADERS
237 vVertexId2 = ADD(vIndices2, vBaseVertex);
238 vVertexId2 = ADD(vVertexId2, vStartVertex);
239 #endif
240 }
241
242 // store out vertex IDs
243 STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
244 #if USE_SIMD16_SHADERS
245 STORE(vVertexId2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
246 #endif
247
248 // store out cut mask if enabled
249 if (fetchState.bEnableCutIndex)
250 {
251 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
252 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
253 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
254 #if USE_SIMD16_SHADERS
255 Value* cutMask2 = VMASK(ICMP_EQ(vIndices2, vCutIndex));
256 STORE(cutMask2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
257 #endif
258 }
259
260 // Fetch attributes from memory and output to a simdvertex struct
261 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
262 #if USE_SIMD16_SHADERS
263 if (fetchState.bDisableVGATHER)
264 {
265 JitLoadVertices(fetchState, streams, vIndices, pVtxOut);
266 JitLoadVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)));
267 }
268 else
269 {
270 JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false);
271 JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true);
272 }
273 #else
274 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
275 : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
276 #endif
277
278 RET_VOID();
279
280 JitManager::DumpToFile(fetch, "src");
281
282 #if defined(_DEBUG)
283 verifyFunction(*fetch);
284 #endif
285
286 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
287
288 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
289 setupPasses.add(createBreakCriticalEdgesPass());
290 setupPasses.add(createCFGSimplificationPass());
291 setupPasses.add(createEarlyCSEPass());
292 setupPasses.add(createPromoteMemoryToRegisterPass());
293
294 setupPasses.run(*fetch);
295
296 JitManager::DumpToFile(fetch, "se");
297
298 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
299
300 ///@todo Haven't touched these either. Need to remove some of these and add others.
301 optPasses.add(createCFGSimplificationPass());
302 optPasses.add(createEarlyCSEPass());
303 optPasses.add(createInstructionCombiningPass());
304 optPasses.add(createInstructionSimplifierPass());
305 optPasses.add(createConstantPropagationPass());
306 optPasses.add(createSCCPPass());
307 optPasses.add(createAggressiveDCEPass());
308
309 optPasses.run(*fetch);
310 optPasses.run(*fetch);
311
312 JitManager::DumpToFile(fetch, "opt");
313
314 return fetch;
315 }
316
317 //////////////////////////////////////////////////////////////////////////
318 /// @brief Loads attributes from memory using LOADs, shuffling the
319 /// components into SOA form.
320 /// *Note* currently does not support component control,
321 /// component packing, instancing
322 /// @param fetchState - info about attributes to be fetched from memory
323 /// @param streams - value pointer to the current vertex stream
324 /// @param vIndices - vector value of indices to load
325 /// @param pVtxOut - value pointer to output simdvertex struct
326 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
327 {
328 // Zack shuffles; a variant of the Charleston.
329
330 std::vector<Value*> vectors(16);
331 std::vector<Constant*> pMask(mVWidth);
332 for(uint32_t i = 0; i < mVWidth; ++i)
333 {
334 pMask[i] = (C(i < 4 ? i : 4));
335 }
336 Constant* promoteMask = ConstantVector::get(pMask);
337 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
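// promoteMask widens each 4-wide attribute load to full SIMD width: lanes 0-3 select the loaded components,
// the remaining lanes pull an undef value from uwvec and are ignored by the transpose below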
338
339 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
340 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
341 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
342 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
343 curInstance->setName("curInstance");
344
345 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
346 {
347 Value* elements[4] = {0};
348 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
349 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
350 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
351 uint32_t numComponents = info.numComps;
352 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
353
354 // load path doesn't support component packing
355 SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
356
357 vectors.clear();
358
359 Value *vCurIndices;
360 Value *startOffset;
361 if(ied.InstanceEnable)
362 {
363 Value* stepRate = C(ied.InstanceDataStepRate);
364
365 // prevent a div by 0 for 0 step rate
366 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
367 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
368
369 // calc the current offset into instanced data buffer
370 Value* calcInstance = UDIV(curInstance, stepRate);
371
372 // if step rate is 0, every instance gets instance 0
373 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
374
375 vCurIndices = VBROADCAST(calcInstance);
376
377 startOffset = startInstance;
378 }
379 else
380 {
381 // offset indices by baseVertex
382 vCurIndices = ADD(vIndices, vBaseVertex);
383
384 startOffset = startVertex;
385 }
386
387 // load SWR_VERTEX_BUFFER_STATE::pData
388 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
389
390 // load SWR_VERTEX_BUFFER_STATE::pitch
391 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
392 stride = Z_EXT(stride, mInt64Ty);
393
394 // load SWR_VERTEX_BUFFER_STATE::size
395 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
396 size = Z_EXT(size, mInt64Ty);
397
398 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
399
400 Value *minVertex = NULL;
401 Value *minVertexOffset = NULL;
402 if (fetchState.bPartialVertexBuffer) {
403 // fetch min index for low bounds checking
404 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
405 minVertex = LOAD(minVertex);
406 if (!fetchState.bDisableIndexOOBCheck) {
407 minVertexOffset = MUL(Z_EXT(minVertex, mInt64Ty), stride);
408 }
409 }
410
411 // Load from the stream.
412 for(uint32_t lane = 0; lane < mVWidth; ++lane)
413 {
414 // Get index
415 Value* index = VEXTRACT(vCurIndices, C(lane));
416
417 if (fetchState.bPartialVertexBuffer) {
418 // clamp below minvertex
419 Value *isBelowMin = ICMP_SLT(index, minVertex);
420 index = SELECT(isBelowMin, minVertex, index);
421 }
422
423 index = Z_EXT(index, mInt64Ty);
424
425 Value* offset = MUL(index, stride);
426 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
427 offset = ADD(offset, startVertexOffset);
428
429 if (!fetchState.bDisableIndexOOBCheck) {
430 // check for out-of-bounds access, including partial OOB, and replace bad offsets with minVertex
431 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
432 Value *oob = ICMP_ULE(endOffset, size);
433 if (fetchState.bPartialVertexBuffer) {
434 offset = SELECT(oob, offset, minVertexOffset);
435 } else {
436 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
437 }
438 }
439
440 Value* pointer = GEP(stream, offset);
441 // We use a full-lane, but don't actually care.
442 Value* vptr = 0;
443
444 // get a pointer to a 4 component attrib in default address space
445 switch(bpc)
446 {
447 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
448 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
449 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
450 default: SWR_INVALID("Unsupported underlying bpp!");
451 }
452
453 // load 4 components of attribute
454 Value* vec = ALIGNED_LOAD(vptr, 1, false);
455
456 // Convert To FP32 internally
457 switch(info.type[0])
458 {
459 case SWR_TYPE_UNORM:
460 switch(bpc)
461 {
462 case 8:
463 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
464 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
465 break;
466 case 16:
467 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
468 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
469 break;
470 default:
471 SWR_INVALID("Unsupported underlying type!");
472 break;
473 }
474 break;
475 case SWR_TYPE_SNORM:
476 switch(bpc)
477 {
478 case 8:
479 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
480 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
481 break;
482 case 16:
483 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
484 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
485 break;
486 default:
487 SWR_INVALID("Unsupported underlying type!");
488 break;
489 }
490 break;
491 case SWR_TYPE_UINT:
492 // Zero extend UINT types to 32 bits.
493 switch(bpc)
494 {
495 case 8:
496 case 16:
497 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
498 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
499 break;
500 case 32:
501 break; // Pass through unchanged.
502 default:
503 SWR_INVALID("Unsupported underlying type!");
504 break;
505 }
506 break;
507 case SWR_TYPE_SINT:
508 // Sign extend SINT types.
509 switch(bpc)
510 {
511 case 8:
512 case 16:
513 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
514 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
515 break;
516 case 32:
517 break; // Pass through unchanged.
518 default:
519 SWR_INVALID("Unsupported underlying type!");
520 break;
521 }
522 break;
523 case SWR_TYPE_FLOAT:
524 switch(bpc)
525 {
526 case 32:
527 break; // Pass through unchanged.
528 default:
529 SWR_INVALID("Unsupported underlying type!");
530 }
531 break;
532 case SWR_TYPE_USCALED:
533 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
534 break;
535 case SWR_TYPE_SSCALED:
536 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
537 break;
538 case SWR_TYPE_SFIXED:
539 vec = FMUL(SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)), VBROADCAST(C(1/65536.0f)));
540 break;
541 case SWR_TYPE_UNKNOWN:
542 case SWR_TYPE_UNUSED:
543 SWR_INVALID("Unsupported type %d!", info.type[0]);
544 }
545
546 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
547 // uwvec: 4 x F32, undef value
548 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
549 vectors.push_back(wvec);
550 }
551
552 std::vector<Constant*> v01Mask(mVWidth);
553 std::vector<Constant*> v23Mask(mVWidth);
554 std::vector<Constant*> v02Mask(mVWidth);
555 std::vector<Constant*> v13Mask(mVWidth);
556
557 // Concatenate the vectors together.
558 elements[0] = VUNDEF_F();
559 elements[1] = VUNDEF_F();
560 elements[2] = VUNDEF_F();
561 elements[3] = VUNDEF_F();
562 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
563 {
564 v01Mask[4 * b + 0] = C(0 + 4 * b);
565 v01Mask[4 * b + 1] = C(1 + 4 * b);
566 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
567 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
568
569 v23Mask[4 * b + 0] = C(2 + 4 * b);
570 v23Mask[4 * b + 1] = C(3 + 4 * b);
571 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
572 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
573
574 v02Mask[4 * b + 0] = C(0 + 4 * b);
575 v02Mask[4 * b + 1] = C(2 + 4 * b);
576 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
577 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
578
579 v13Mask[4 * b + 0] = C(1 + 4 * b);
580 v13Mask[4 * b + 1] = C(3 + 4 * b);
581 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
582 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
583
584 std::vector<Constant*> iMask(mVWidth);
585 for(uint32_t i = 0; i < mVWidth; ++i)
586 {
587 if(((4 * b) <= i) && (i < (4 * (b + 1))))
588 {
589 iMask[i] = C(i % 4 + mVWidth);
590 }
591 else
592 {
593 iMask[i] = C(i);
594 }
595 }
596 Constant* insertMask = ConstantVector::get(iMask);
597 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
598 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
599 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
600 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
601 }
602
603 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
604 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
605 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
606 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
607 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
608 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
609 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
610 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
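// at this point elements[0..3] hold the x, y, z and w components respectively for all mVWidth vertices (AOS to SOA transpose complete)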
611
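// note: intentional case fall-through below; components not supplied by the format are filled with the defaults (0, 0, 0, 1)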
612 switch(numComponents + 1)
613 {
614 case 1: elements[0] = VIMMED1(0.0f);
615 case 2: elements[1] = VIMMED1(0.0f);
616 case 3: elements[2] = VIMMED1(0.0f);
617 case 4: elements[3] = VIMMED1(1.0f);
618 }
619
620 for(uint32_t c = 0; c < 4; ++c)
621 {
622 #if USE_SIMD16_SHADERS
623 Value* dest = GEP(pVtxOut, C(nelt * 8 + c * 2), "destGEP");
624 #else
625 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
626 #endif
627 STORE(elements[c], dest);
628 }
629 }
630 }
631
632 // returns true for odd formats that require special gather handling
633 bool FetchJit::IsOddFormat(SWR_FORMAT format)
634 {
635 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
636 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
637 {
638 return true;
639 }
640 return false;
641 }
642
643 // format is uniform if all components are the same size and type
644 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
645 {
646 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
647 uint32_t bpc0 = info.bpc[0];
648 uint32_t type0 = info.type[0];
649
650 for (uint32_t c = 1; c < info.numComps; ++c)
651 {
652 if (bpc0 != info.bpc[c] || type0 != info.type[c])
653 {
654 return false;
655 }
656 }
657 return true;
658 }
659
660 // unpacks components based on format
661 // foreach component in the pixel
662 // mask off everything but this component
663 // shift component to LSB
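// e.g. (illustration only) for a packed 10-10-10-2 format, component 0 comes from bits 0..9, component 1 from bits 10..19,
// component 2 from bits 20..29 and component 3 from bits 30..31; each is routed into result[] through the format's swizzle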
664 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
665 {
666 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
667
668 uint32_t bitOffset = 0;
669 for (uint32_t c = 0; c < info.numComps; ++c)
670 {
671 uint32_t swizzledIndex = info.swizzle[c];
672 uint32_t compBits = info.bpc[c];
673 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
674 Value* comp = AND(vInput, bitmask);
675 comp = LSHR(comp, bitOffset);
676
677 result[swizzledIndex] = comp;
678 bitOffset += compBits;
679 }
680 }
681
682 // gather for odd component size formats
683 // gather SIMD full pixels per lane then shift/mask to move each component into its
684 // own vector
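// lanes rejected by the gather mask keep the zero gather source, and components not present in the format keep the format's default values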
685 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
686 {
687 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
688
689 // only works if pixel size is <= 32bits
690 SWR_ASSERT(info.bpp <= 32);
691
692 Value* pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask, C((char)1));
693
694 for (uint32_t comp = 0; comp < 4; ++comp)
695 {
696 pResult[comp] = VIMMED1((int)info.defaults[comp]);
697 }
698
699 UnpackComponents(format, pGather, pResult);
700
701 // cast to fp32
702 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
703 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
704 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
705 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
706 }
707
708 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
709 {
710 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
711
712 for (uint32_t c = 0; c < info.numComps; ++c)
713 {
714 uint32_t compIndex = info.swizzle[c];
715
716 // skip any conversion on UNUSED components
717 if (info.type[c] == SWR_TYPE_UNUSED)
718 {
719 continue;
720 }
721
722 if (info.isNormalized[c])
723 {
724 if (info.type[c] == SWR_TYPE_SNORM)
725 {
726 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
727
728 /// result = c * (1.0f / (2^(n-1) - 1))
729 uint32_t n = info.bpc[c];
730 uint32_t pow2 = 1 << (n - 1);
731 float scale = 1.0f / (float)(pow2 - 1);
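// e.g. for an 8-bit SNORM component, scale = 1.0f / 127.0f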
732 Value *vScale = VIMMED1(scale);
733 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
734 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
735 texels[compIndex] = FMUL(texels[compIndex], vScale);
736 }
737 else
738 {
739 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
740
741 /// result = c * (1.0f / (2^n - 1))
742 uint32_t n = info.bpc[c];
743 uint32_t pow2 = 1 << n;
744 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
745 if (n == 24)
746 {
747 float scale = (float)(pow2 - 1);
748 Value* vScale = VIMMED1(scale);
749 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
750 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
751 texels[compIndex] = FDIV(texels[compIndex], vScale);
752 }
753 else
754 {
755 float scale = 1.0f / (float)(pow2 - 1);
756 Value *vScale = VIMMED1(scale);
757 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
758 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
759 texels[compIndex] = FMUL(texels[compIndex], vScale);
760 }
761 }
762 continue;
763 }
764 }
765 }
766
767 //////////////////////////////////////////////////////////////////////////
768 /// @brief Loads attributes from memory using AVX2 GATHER(s)
769 /// @param fetchState - info about attributes to be fetched from memory
770 /// @param streams - value pointer to the current vertex stream
771 /// @param vIndices - vector value of indices to gather
772 /// @param pVtxOut - value pointer to output simdvertex struct
773 #if USE_SIMD16_SHADERS
774 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
775 Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2)
776 #else
777 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
778 Value* streams, Value* vIndices, Value* pVtxOut)
779 #endif
780 {
781 uint32_t currentVertexElement = 0;
782 uint32_t outputElt = 0;
783 Value* vVertexElements[4];
784
785 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
786 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
787 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
788 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
789 curInstance->setName("curInstance");
790
791 for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
792 {
793 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
794
795 // skip element if all components are disabled
796 if (ied.ComponentPacking == ComponentEnable::NONE)
797 {
798 continue;
799 }
800
801 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
802 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
803 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
804
805 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
806
807 // VGATHER* takes an *i8 src pointer
808 Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
809
810 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
811 Value *vStride = VBROADCAST(stride);
812
813 // max vertex index that is fully in bounds
814 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
815 maxVertex = LOAD(maxVertex);
816
817 Value *minVertex = NULL;
818 if (fetchState.bPartialVertexBuffer) {
819 // min vertex index for low bounds OOB checking
820 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
821 minVertex = LOAD(minVertex);
822 }
823
824 Value *vCurIndices;
825 Value *startOffset;
826 if(ied.InstanceEnable)
827 {
828 Value* stepRate = C(ied.InstanceDataStepRate);
829
830 // prevent a div by 0 for 0 step rate
831 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
832 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
833
834 // calc the current offset into instanced data buffer
835 Value* calcInstance = UDIV(curInstance, stepRate);
836
837 // if step rate is 0, every instance gets instance 0
838 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
839
840 vCurIndices = VBROADCAST(calcInstance);
841
842 startOffset = startInstance;
843 }
844 else
845 {
846 // offset indices by baseVertex
847 vCurIndices = ADD(vIndices, vBaseVertex);
848
849 startOffset = startVertex;
850 }
851
852 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
853 // do 64bit address offset calculations.
854
855 // calculate byte offset to the start of the VB
856 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
857 pStreamBase = GEP(pStreamBase, baseOffset);
858
859 // if we have a start offset, subtract from max vertex. Used for OOB check
860 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
861 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
862 // if we have a negative value, we're already OOB. clamp at 0.
863 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
864
865 if (fetchState.bPartialVertexBuffer) {
866 // similarly for min vertex
867 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
868 Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
869 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
870 }
871
872 // Load the in bounds size of a partially valid vertex
873 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
874 partialInboundsSize = LOAD(partialInboundsSize);
875 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
876 Value* vBpp = VBROADCAST(C(info.Bpp));
877 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
878
879 // is the element <= the partially valid size
880 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
881
882 // override cur indices with 0 if pitch is 0
883 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
884 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
885
886 // are vertices partially OOB?
887 Value* vMaxVertex = VBROADCAST(maxVertex);
888 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
889
890 // are vertices fully in bounds?
891 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
892
893 Value *vGatherMask;
894 if (fetchState.bPartialVertexBuffer) {
895 // are vertices below minVertex limit?
896 Value *vMinVertex = VBROADCAST(minVertex);
897 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
898
899 // only fetch lanes that pass both tests
900 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
901 } else {
902 vGatherMask = vMaxGatherMask;
903 }
904
905 // blend in any partially OOB indices that have valid elements
906 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
907 Value* pMask = vGatherMask;
908 vGatherMask = VMASK(vGatherMask);
909
910 // calculate the actual offsets into the VB
911 Value* vOffsets = MUL(vCurIndices, vStride);
912 vOffsets = ADD(vOffsets, vAlignmentOffsets);
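// vOffsets now holds each lane's byte offset from pStreamBase for this element; lanes masked off above
// receive the gather source value instead of a memory load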
913
914 // Packing and component control
915 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
916 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
917 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
918
919 // Special gather/conversion for formats without equal component sizes
920 if (IsOddFormat((SWR_FORMAT)ied.Format))
921 {
922 Value* pResults[4];
923 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
924 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
925
926 for (uint32_t c = 0; c < 4; ++c)
927 {
928 if (isComponentEnabled(compMask, c))
929 {
930 vVertexElements[currentVertexElement++] = pResults[c];
931 if (currentVertexElement > 3)
932 {
933 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
934 // reset to the next vVertexElement to output
935 currentVertexElement = 0;
936 }
937 }
938 }
939 }
940 else if(info.type[0] == SWR_TYPE_FLOAT)
941 {
942 ///@todo: support 64 bit vb accesses
943 Value* gatherSrc = VIMMED1(0.0f);
944
945 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
946 "Unsupported format for standard gather fetch.");
947
948 // Gather components from memory to store in a simdvertex structure
949 switch(bpc)
950 {
951 case 16:
952 {
953 Value* vGatherResult[2];
954 Value *vMask;
955
956 // if we have at least one component out of x or y to fetch
957 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
958 // save mask as it is zero'd out after each gather
959 vMask = vGatherMask;
960
961 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
962 // e.g. result of first 8x32bit integer gather for 16bit components
963 // 256i - 0 1 2 3 4 5 6 7
964 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
965 //
966 }
967
968 // if we have at least one component out of z or w to fetch
969 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
970 // offset base to the next components(zw) in the vertex to gather
971 pStreamBase = GEP(pStreamBase, C((char)4));
972 vMask = vGatherMask;
973
974 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
975 // e.g. result of second 8x32bit integer gather for 16bit components
976 // 256i - 0 1 2 3 4 5 6 7
977 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
978 //
979 }
980
981 // if we have at least one component to shuffle into place
982 if(compMask){
983 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
984 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
985
986 // Shuffle gathered components into place in simdvertex struct
987 #if USE_SIMD16_SHADERS
988 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
989 #else
990 Shuffle16bpcGather(args); // outputs to vVertexElements ref
991 #endif
992 }
993 }
994 break;
995 case 32:
996 {
997 for (uint32_t i = 0; i < 4; i++)
998 {
999 if (isComponentEnabled(compMask, i))
1000 {
1001 // if we need to gather the component
1002 if (compCtrl[i] == StoreSrc)
1003 {
1004 // save mask as it is zero'd out after each gather
1005 Value *vMask = vGatherMask;
1006
1007 // Gather a SIMD of vertices
1008 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
1009 }
1010 else
1011 {
1012 #if USE_SIMD16_SHADERS
1013 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1014 #else
1015 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1016 #endif
1017 }
1018
1019 if (currentVertexElement > 3)
1020 {
1021 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1022 // reset to the next vVertexElement to output
1023 currentVertexElement = 0;
1024 }
1025
1026 }
1027
1028 // offset base to the next component in the vertex to gather
1029 pStreamBase = GEP(pStreamBase, C((char)4));
1030 }
1031 }
1032 break;
1033 case 64:
1034 {
1035 for (uint32_t i = 0; i < 4; i++)
1036 {
1037 if (isComponentEnabled(compMask, i))
1038 {
1039 // if we need to gather the component
1040 if (compCtrl[i] == StoreSrc)
1041 {
1042 Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
1043 Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
1044 vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4));
1045 vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4));
1046 vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4));
1047 vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4));
1048
1049 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1050 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1051
1052 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1053
1054 Value* pGatherLo = GATHERPD(vZeroDouble,
1055 pStreamBase, vOffsetsLo, vMaskLo, C((char)1));
1056 Value* pGatherHi = GATHERPD(vZeroDouble,
1057 pStreamBase, vOffsetsHi, vMaskHi, C((char)1));
1058
1059 pGatherLo = VCVTPD2PS(pGatherLo);
1060 pGatherHi = VCVTPD2PS(pGatherHi);
1061
1062 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
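// the two 4-wide double gathers are converted to packed single precision and concatenated into a full-width float result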
1063
1064 vVertexElements[currentVertexElement++] = pGather;
1065 }
1066 else
1067 {
1068 #if USE_SIMD16_SHADERS
1069 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1070 #else
1071 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1072 #endif
1073 }
1074
1075 if (currentVertexElement > 3)
1076 {
1077 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1078 // reset to the next vVertexElement to output
1079 currentVertexElement = 0;
1080 }
1081
1082 }
1083
1084 // offset base to the next component in the vertex to gather
1085 pStreamBase = GEP(pStreamBase, C((char)8));
1086 }
1087 }
1088 break;
1089 default:
1090 SWR_INVALID("Tried to fetch invalid FP format");
1091 break;
1092 }
1093 }
1094 else
1095 {
1096 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
1097 ConversionType conversionType = CONVERT_NONE;
1098
1099 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1100 "Unsupported format for standard gather fetch.");
1101
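// note: UNORM and SNORM intentionally fall through to the UINT/SINT cases to pick up the matching extend type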
1102 switch(info.type[0])
1103 {
1104 case SWR_TYPE_UNORM:
1105 conversionType = CONVERT_NORMALIZED;
1106 case SWR_TYPE_UINT:
1107 extendCastType = Instruction::CastOps::ZExt;
1108 break;
1109 case SWR_TYPE_SNORM:
1110 conversionType = CONVERT_NORMALIZED;
1111 case SWR_TYPE_SINT:
1112 extendCastType = Instruction::CastOps::SExt;
1113 break;
1114 case SWR_TYPE_USCALED:
1115 conversionType = CONVERT_USCALED;
1116 extendCastType = Instruction::CastOps::UIToFP;
1117 break;
1118 case SWR_TYPE_SSCALED:
1119 conversionType = CONVERT_SSCALED;
1120 extendCastType = Instruction::CastOps::SIToFP;
1121 break;
1122 case SWR_TYPE_SFIXED:
1123 conversionType = CONVERT_SFIXED;
1124 extendCastType = Instruction::CastOps::SExt;
1125 break;
1126 default:
1127 break;
1128 }
1129
1130 // value substituted when component of gather is masked
1131 Value* gatherSrc = VIMMED1(0);
1132
1133 // Gather components from memory to store in a simdvertex structure
1134 switch (bpc)
1135 {
1136 case 8:
1137 {
1138 // if we have at least one component to fetch
1139 if(compMask)
1140 {
1141 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
1142 // e.g. result of an 8x32bit integer gather for 8bit components
1143 // 256i - 0 1 2 3 4 5 6 7
1144 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1145
1146 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1147 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1148
1149 // Shuffle gathered components into place in simdvertex struct
1150 #if USE_SIMD16_SHADERS
1151 Shuffle8bpcGatherd(args, useVertexID2); // outputs to vVertexElements ref
1152 #else
1153 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
1154 #endif
1155 }
1156 }
1157 break;
1158 case 16:
1159 {
1160 Value* vGatherResult[2];
1161 Value *vMask;
1162
1163 // if we have at least one component out of x or y to fetch
1164 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1165 // save mask as it is zero'd out after each gather
1166 vMask = vGatherMask;
1167
1168 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
1169 // e.g. result of first 8x32bit integer gather for 16bit components
1170 // 256i - 0 1 2 3 4 5 6 7
1171 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1172 //
1173 }
1174
1175 // if we have at least one component out of z or w to fetch
1176 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1177 // offset base to the next components(zw) in the vertex to gather
1178 pStreamBase = GEP(pStreamBase, C((char)4));
1179 vMask = vGatherMask;
1180
1181 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
1182 // e.g. result of second 8x32bit integer gather for 16bit components
1183 // 256i - 0 1 2 3 4 5 6 7
1184 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1185 //
1186 }
1187
1188 // if we have at least one component to shuffle into place
1189 if(compMask){
1190 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1191 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1192
1193 // Shuffle gathered components into place in simdvertex struct
1194 #if USE_SIMD16_SHADERS
1195 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1196 #else
1197 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1198 #endif
1199 }
1200 }
1201 break;
1202 case 32:
1203 {
1204 // Gather components from memory into place in the simdvertex struct
1205 for (uint32_t i = 0; i < 4; i++)
1206 {
1207 if (isComponentEnabled(compMask, i))
1208 {
1209 // if we need to gather the component
1210 if (compCtrl[i] == StoreSrc)
1211 {
1212 // save mask as it is zero'd out after each gather
1213 Value *vMask = vGatherMask;
1214
1215 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
1216
1217 if (conversionType == CONVERT_USCALED)
1218 {
1219 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1220 }
1221 else if (conversionType == CONVERT_SSCALED)
1222 {
1223 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1224 }
1225 else if (conversionType == CONVERT_SFIXED)
1226 {
1227 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
1228 }
1229
1230 vVertexElements[currentVertexElement++] = pGather;
1231 // e.g. result of a single 8x32bit integer gather for 32bit components
1232 // 256i - 0 1 2 3 4 5 6 7
1233 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1234 }
1235 else
1236 {
1237 #if USE_SIMD16_SHADERS
1238 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1239 #else
1240 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1241 #endif
1242 }
1243
1244 if (currentVertexElement > 3)
1245 {
1246 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1247 // reset to the next vVertexElement to output
1248 currentVertexElement = 0;
1249 }
1250
1251 }
1252
1253 // offset base to the next component in the vertex to gather
1254 pStreamBase = GEP(pStreamBase, C((char)4));
1255 }
1256 }
1257 break;
1258 }
1259 }
1260 }
1261
1262 // if we have a partially filled vVertexElement struct, output it
1263 if(currentVertexElement > 0){
1264 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1265 }
1266 }
1267
1268 //////////////////////////////////////////////////////////////////////////
1269 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1270 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1271 /// support
1272 /// @param pIndices - pointer to 8 bit indices
1273 /// @param pLastIndex - pointer to last valid index
1274 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1275 {
1276 // can fit 4 8 bit integers per vWidth lane
1277 Value* vIndices = VUNDEF_I();
1278
1279 // store 0 index on stack to be used to conditionally load from if index address is OOB
1280 Value* pZeroIndex = ALLOCA(mInt8Ty);
1281 STORE(C((uint8_t)0), pZeroIndex);
1282
1283 // Load a SIMD of index pointers
1284 for(int64_t lane = 0; lane < mVWidth; lane++)
1285 {
1286 // Calculate the address of the requested index
1287 Value *pIndex = GEP(pIndices, C(lane));
1288
1289 // check if the address is less than the max index,
1290 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1291
1292 // if valid, load the index. if not, load 0 from the stack
1293 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1294 Value *index = LOAD(pValid, "valid index");
1295
1296 // zero extend index to 32 bits and insert into the correct simd lane
1297 index = Z_EXT(index, mInt32Ty);
1298 vIndices = VINSERT(vIndices, index, lane);
1299 }
1300 return vIndices;
1301 }
1302
1303 //////////////////////////////////////////////////////////////////////////
1304 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1305 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1306 /// support
1307 /// @param pIndices - pointer to 16 bit indices
1308 /// @param pLastIndex - pointer to last valid index
1309 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1310 {
1311 // can fit 2 16 bit integers per vWidth lane
1312 Value* vIndices = VUNDEF_I();
1313
1314 // store 0 index on stack to be used to conditionally load from if index address is OOB
1315 Value* pZeroIndex = ALLOCA(mInt16Ty);
1316 STORE(C((uint16_t)0), pZeroIndex);
1317
1318 // Load a SIMD of index pointers
1319 for(int64_t lane = 0; lane < mVWidth; lane++)
1320 {
1321 // Calculate the address of the requested index
1322 Value *pIndex = GEP(pIndices, C(lane));
1323
1324 // check if the address is less than the max index,
1325 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1326
1327 // if valid, load the index. if not, load 0 from the stack
1328 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1329 Value *index = LOAD(pValid, "valid index");
1330
1331 // zero extend index to 32 bits and insert into the correct simd lane
1332 index = Z_EXT(index, mInt32Ty);
1333 vIndices = VINSERT(vIndices, index, lane);
1334 }
1335 return vIndices;
1336 }
1337
1338 //////////////////////////////////////////////////////////////////////////
1339 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1340 /// @param pIndices - pointer to 32 bit indices
1341 /// @param pLastIndex - pointer to last valid index
1342 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1343 {
1344 DataLayout dL(JM()->mpCurrentModule);
1345 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1346 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1347 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1348
1349 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1350 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1351 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1352 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1353
1354 // create a vector of index counts from the base index ptr passed into the fetch
1355 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1356 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1357
1358 // compare index count to the max valid index
1359 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1360 // vIndexOffsets 0 1 2 3 4 5 6 7
1361 // ------------------------------
1362 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1363 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1364 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1365 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1366
1367 // VMASKLOAD takes an *i8 src pointer
1368 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1369
1370 // Load the indices; OOB loads 0
1371 return MASKLOADD(pIndices,vIndexMask);
1372 }
1373
1374 //////////////////////////////////////////////////////////////////////////
1375 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1376 /// denormalizes if needed, converts to F32 if needed, and positions in
1377 /// the proper SIMD rows to be output to the simdvertex structure
1378 /// @param args: (tuple of args, listed below)
1379 /// @param vGatherResult - 8 gathered 8bpc vertices
1380 /// @param pVtxOut - base pointer to output simdvertex struct
1381 /// @param extendType - sign extend or zero extend
1382 /// @param conversionType - type of conversion to apply (none, normalized, scaled, fixed)
1383 /// @param currentVertexElement - reference to the current vVertexElement
1384 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1385 /// @param compMask - component packing mask
1386 /// @param compCtrl - component control val
1387 /// @param vVertexElements[4] - vertex components to output
1388 /// @param swizzle[4] - component swizzle location
1389 #if USE_SIMD16_SHADERS
1390 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2)
1391 #else
1392 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1393 #endif
1394 {
1395 // Unpack tuple args
1396 Value*& vGatherResult = std::get<0>(args);
1397 Value* pVtxOut = std::get<1>(args);
1398 const Instruction::CastOps extendType = std::get<2>(args);
1399 const ConversionType conversionType = std::get<3>(args);
1400 uint32_t &currentVertexElement = std::get<4>(args);
1401 uint32_t &outputElt = std::get<5>(args);
1402 const ComponentEnable compMask = std::get<6>(args);
1403 const ComponentControl (&compCtrl)[4] = std::get<7>(args);
1404 Value* (&vVertexElements)[4] = std::get<8>(args);
1405 const uint32_t (&swizzle)[4] = std::get<9>(args);
1406
1407 // cast types
1408 Type* vGatherTy = mSimdInt32Ty;
1409 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1410
1411 // have to do extra work for sign extending
1412 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
1413 Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
1414 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1415
1416 // shuffle mask, including any swizzling
1417 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1418 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1419 Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
1420 char(y), char(y+4), char(y+8), char(y+12),
1421 char(z), char(z+4), char(z+8), char(z+12),
1422 char(w), char(w+4), char(w+8), char(w+12),
1423 char(x), char(x+4), char(x+8), char(x+12),
1424 char(y), char(y+4), char(y+8), char(y+12),
1425 char(z), char(z+4), char(z+8), char(z+12),
1426 char(w), char(w+4), char(w+8), char(w+12)});
1427
1428 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1429 // after pshufb: group components together in each 128bit lane
1430 // 256i - 0 1 2 3 4 5 6 7
1431 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1432
1433 Value* vi128XY = nullptr;
1434 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1435 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1436 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1437 // 256i - 0 1 2 3 4 5 6 7
1438 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1439 }
1440
1441 // do the same for zw components
1442 Value* vi128ZW = nullptr;
1443 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1444 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1445 }
1446
1447 // init denormalize variables if needed
1448 Instruction::CastOps fpCast;
1449 Value* conversionFactor;
1450
1451 switch (conversionType)
1452 {
1453 case CONVERT_NORMALIZED:
1454 fpCast = Instruction::CastOps::SIToFP;
1455 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1456 break;
1457 case CONVERT_SSCALED:
1458 fpCast = Instruction::CastOps::SIToFP;
1459 conversionFactor = VIMMED1((float)(1.0));
1460 break;
1461 case CONVERT_USCALED:
1462 SWR_INVALID("Type should not be sign extended!");
1463 conversionFactor = nullptr;
1464 break;
1465 default:
1466 SWR_ASSERT(conversionType == CONVERT_NONE);
1467 conversionFactor = nullptr;
1468 break;
1469 }
1470
1471 // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
1472 for (uint32_t i = 0; i < 4; i++)
1473 {
1474 if (isComponentEnabled(compMask, i))
1475 {
1476 if (compCtrl[i] == ComponentControl::StoreSrc)
1477 {
1478 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1479 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1480 // if x or y, use vi128XY permute result, else use vi128ZW
1481 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1482
1483 // sign extend
1484 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
1485
1486 // denormalize if needed
1487 if (conversionType != CONVERT_NONE)
1488 {
1489 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1490 }
1491 currentVertexElement++;
1492 }
1493 else
1494 {
1495 #if USE_SIMD16_SHADERS
1496 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1497 #else
1498 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1499 #endif
1500 }
1501
1502 if (currentVertexElement > 3)
1503 {
1504 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1505 // reset to the next vVertexElement to output
1506 currentVertexElement = 0;
1507 }
1508 }
1509 }
1510 }
1511 // else zero extend
1512 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1513 {
1514 // init denormalize variables if needed
1515 Instruction::CastOps fpCast;
1516 Value* conversionFactor;
1517
1518 switch (conversionType)
1519 {
1520 case CONVERT_NORMALIZED:
1521 fpCast = Instruction::CastOps::UIToFP;
1522 conversionFactor = VIMMED1((float)(1.0 / 255.0));
1523 break;
1524 case CONVERT_USCALED:
1525 fpCast = Instruction::CastOps::UIToFP;
1526 conversionFactor = VIMMED1((float)(1.0));
1527 break;
1528 case CONVERT_SSCALED:
1529 SWR_INVALID("Type should not be zero extended!");
1530 conversionFactor = nullptr;
1531 break;
1532 default:
1533 SWR_ASSERT(conversionType == CONVERT_NONE);
1534 conversionFactor = nullptr;
1535 break;
1536 }
1537
1538 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1539 for (uint32_t i = 0; i < 4; i++)
1540 {
1541 if (isComponentEnabled(compMask, i))
1542 {
1543 if (compCtrl[i] == ComponentControl::StoreSrc)
1544 {
1545 // pshufb masks for each component
1546 Value* vConstMask;
1547 switch (swizzle[i])
1548 {
1549 case 0:
1550 // x shuffle mask
1551 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1552 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
1553 break;
1554 case 1:
1555 // y shuffle mask
1556 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1557 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
1558 break;
1559 case 2:
1560 // z shuffle mask
1561 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1562 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
1563 break;
1564 case 3:
1565 // w shuffle mask
1566 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1567 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
1568 break;
1569 default:
1570 vConstMask = nullptr;
1571 break;
1572 }
1573
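                    // The -1 entries in each mask set the high bit of that mask byte, so PSHUFB
                    // writes a zero there; the selected component byte therefore lands in the low
                    // byte of each 32bit lane with the upper three bytes already cleared, which is
                    // what performs the zero extension.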
1574 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1575 // after pshufb for x channel
1576 // 256i - 0 1 2 3 4 5 6 7
1577 // x000 x000 x000 x000 x000 x000 x000 x000
1578
1579 // denormalize if needed
1580 if (conversionType != CONVERT_NONE)
1581 {
1582 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1583 }
1584 currentVertexElement++;
1585 }
1586 else
1587 {
1588 #if USE_SIMD16_SHADERS
1589 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1590 #else
1591 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1592 #endif
1593 }
1594
1595 if (currentVertexElement > 3)
1596 {
1597 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1598 // reset to the next vVertexElement to output
1599 currentVertexElement = 0;
1600 }
1601 }
1602 }
1603 }
1604 else
1605 {
1606 SWR_INVALID("Unsupported conversion type");
1607 }
1608 }
1609
1610 //////////////////////////////////////////////////////////////////////////
1611 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1612 /// denormalizes if needed, converts to F32 if needed, and positions in
1613 /// the proper SIMD rows to be output to the simdvertex structure
1614 /// @param args: (tuple of args, listed below)
1615 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1616 /// @param pVtxOut - base pointer to output simdvertex struct
1617 /// @param extendType - sign extend or zero extend
1618 /// @param conversionType - type of conversion to apply (normalized, scaled, or none)
1619 /// @param currentVertexElement - reference to the current vVertexElement
1620 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1621 /// @param compMask - component packing mask
1622 /// @param compCtrl - component control val
1623 /// @param vVertexElements[4] - vertex components to output
1624 #if USE_SIMD16_SHADERS
1625 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2)
1626 #else
1627 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
1628 #endif
1629 {
1630 // Unpack tuple args
1631 Value* (&vGatherResult)[2] = std::get<0>(args);
1632 Value* pVtxOut = std::get<1>(args);
1633 const Instruction::CastOps extendType = std::get<2>(args);
1634 const ConversionType conversionType = std::get<3>(args);
1635 uint32_t &currentVertexElement = std::get<4>(args);
1636 uint32_t &outputElt = std::get<5>(args);
1637 const ComponentEnable compMask = std::get<6>(args);
1638 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1639 Value* (&vVertexElements)[4] = std::get<8>(args);
1640
1641 // cast types
1642 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1643 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1644
1645 // have to do extra work for sign extending
1646 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
1647 (extendType == Instruction::CastOps::FPExt))
1648 {
1649         // does this format use an FP extend (16-bit float source) rather than an integer sign extend?
1650         bool bFP = (extendType == Instruction::CastOps::FPExt);
1651
1652 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1653 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1654
1655 // shuffle mask
1656 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1657 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1658 Value* vi128XY = nullptr;
1659 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1660 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1661 // after pshufb: group components together in each 128bit lane
1662 // 256i - 0 1 2 3 4 5 6 7
1663 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1664
1665 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1666 // after PERMD: move and pack xy components into each 128bit lane
1667 // 256i - 0 1 2 3 4 5 6 7
1668 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1669 }
1670
1671 // do the same for zw components
1672 Value* vi128ZW = nullptr;
1673 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1674 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1675 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
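            // after PERMD: move and pack zw components into each 128bit lane
            // 256i - 0  1  2  3  4  5  6  7
            //        zzzz zzzz zzzz zzzz wwww wwww wwww wwww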
1676 }
1677
1678 // init denormalize variables if needed
1679 Instruction::CastOps IntToFpCast;
1680 Value* conversionFactor;
1681
1682 switch (conversionType)
1683 {
1684 case CONVERT_NORMALIZED:
1685 IntToFpCast = Instruction::CastOps::SIToFP;
1686 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1687 break;
1688 case CONVERT_SSCALED:
1689 IntToFpCast = Instruction::CastOps::SIToFP;
1690 conversionFactor = VIMMED1((float)(1.0));
1691 break;
1692 case CONVERT_USCALED:
1693 SWR_INVALID("Type should not be sign extended!");
1694 conversionFactor = nullptr;
1695 break;
1696 default:
1697 SWR_ASSERT(conversionType == CONVERT_NONE);
1698 conversionFactor = nullptr;
1699 break;
1700 }
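    // Illustrative example (comment only): with CONVERT_NORMALIZED a 16-bit SNORM value of
    // 16384 is sign extended, converted to float, and scaled by 1/32767, storing roughly
    // 0.5f; CONVERT_SSCALED converts to float without rescaling.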
1701
1702     // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1703 for (uint32_t i = 0; i < 4; i++)
1704 {
1705 if (isComponentEnabled(compMask, i))
1706 {
1707 if (compCtrl[i] == ComponentControl::StoreSrc)
1708 {
1709 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1710 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1711 // if x or y, use vi128XY permute result, else use vi128ZW
1712 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1713
1714 if (bFP) {
1715                         // extract 128 bit lanes and convert the packed half floats to single precision
1716 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1717 }
1718 else {
1719 // extract 128 bit lanes to sign extend each component
1720 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1721
1722 // denormalize if needed
1723 if (conversionType != CONVERT_NONE) {
1724 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1725 }
1726 }
1727 currentVertexElement++;
1728 }
1729 else
1730 {
1731 #if USE_SIMD16_SHADERS
1732 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1733 #else
1734 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1735 #endif
1736 }
1737
1738 if (currentVertexElement > 3)
1739 {
1740 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1741 // reset to the next vVertexElement to output
1742 currentVertexElement = 0;
1743 }
1744 }
1745 }
1746 }
1747 // else zero extend
1748 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1749 {
1750 // pshufb masks for each component
1751 Value* vConstMask[2];
1752 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
1753 // x/z shuffle mask
1754 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1755                                     0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1});
1756 }
1757
1758 if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
1759 // y/w shuffle mask
1760 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1761 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1762 }
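        // The -1 byte pairs in these masks zero the upper 16 bits of every 32bit lane, so the
        // selected word is already zero extended to 32 bits once the PSHUFB below has run.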
1763
1764 // init denormalize variables if needed
1765 Instruction::CastOps fpCast;
1766 Value* conversionFactor;
1767
1768 switch (conversionType)
1769 {
1770 case CONVERT_NORMALIZED:
1771 fpCast = Instruction::CastOps::UIToFP;
1772 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1773 break;
1774 case CONVERT_USCALED:
1775 fpCast = Instruction::CastOps::UIToFP;
1776             conversionFactor = VIMMED1((float)(1.0));
1777 break;
1778 case CONVERT_SSCALED:
1779 SWR_INVALID("Type should not be zero extended!");
1780 conversionFactor = nullptr;
1781 break;
1782 default:
1783 SWR_ASSERT(conversionType == CONVERT_NONE);
1784 conversionFactor = nullptr;
1785 break;
1786 }
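    // Illustrative example (comment only): with CONVERT_NORMALIZED the factor 1/65535 maps the
    // zero-extended word range [0, 65535] onto [0.0f, 1.0f], so 65535 stores exactly 1.0f.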
1787
1788 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1789 for (uint32_t i = 0; i < 4; i++)
1790 {
1791 if (isComponentEnabled(compMask, i))
1792 {
1793 if (compCtrl[i] == ComponentControl::StoreSrc)
1794 {
1795 // select correct constMask for x/z or y/w pshufb
1796 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1797 // if x or y, use vi128XY permute result, else use vi128ZW
1798 uint32_t selectedGather = (i < 2) ? 0 : 1;
1799
1800 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1801 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1802 // 256i - 0 1 2 3 4 5 6 7
1803 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1804
1805 // denormalize if needed
1806 if (conversionType != CONVERT_NONE)
1807 {
1808 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1809 }
1810 currentVertexElement++;
1811 }
1812 else
1813 {
1814 #if USE_SIMD16_SHADERS
1815 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1816 #else
1817 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1818 #endif
1819 }
1820
1821 if (currentVertexElement > 3)
1822 {
1823 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1824 // reset to the next vVertexElement to output
1825 currentVertexElement = 0;
1826 }
1827 }
1828 }
1829 }
1830 else
1831 {
1832 SWR_INVALID("Unsupported conversion type");
1833 }
1834 }
1835
1836 //////////////////////////////////////////////////////////////////////////
1837 /// @brief Output a simdvertex worth of elements to the current outputElt
1838 /// @param pVtxOut - base address of VIN output struct
1839 /// @param outputElt - simdvertex offset in VIN to write to
1840 /// @param numEltsToStore - number of simdvertex rows to write out
1841 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
1842 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
1843 {
1844 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
1845
1846 for(uint32_t c = 0; c < numEltsToStore; ++c)
1847 {
1848 // STORE expects FP32 x vWidth type, just bitcast if needed
1849 if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){
1850 #if FETCH_DUMP_VERTEX
1851 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
1852 #endif
1853 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
1854 }
1855 #if FETCH_DUMP_VERTEX
1856 else
1857 {
1858 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
1859 }
1860 #endif
1861 // outputElt * 4 = offsetting by the size of a simdvertex
1862 // + c offsets to a 32bit x vWidth row within the current vertex
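        // In SIMD16 builds each simdvertex row is twice the width of the simd8 vectors built
        // here, so a component row spans two GEP steps - hence the outputElt * 8
        // (4 components * 2) + c * 2 scaling in that path.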
1863 #if USE_SIMD16_SHADERS
1864 Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP");
1865 #else
1866 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
1867 #endif
1868 STORE(vVertexElements[c], dest);
1869 }
1870 }
1871
1872 //////////////////////////////////////////////////////////////////////////
1873 /// @brief Generates a constant vector of values based on the
1874 /// ComponentControl value
1875 /// @param ctrl - ComponentControl value
1876 #if USE_SIMD16_SHADERS
1877 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2)
1878 #else
1879 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
1880 #endif
1881 {
1882 switch(ctrl)
1883 {
1884 case NoStore: return VUNDEF_I();
1885 case Store0: return VIMMED1(0);
1886 case Store1Fp: return VIMMED1(1.0f);
1887 case Store1Int: return VIMMED1(1);
1888 case StoreVertexId:
1889 {
1890 #if USE_SIMD16_SHADERS
1891 Value* pId;
1892 if (useVertexID2)
1893 {
1894 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
1895 }
1896 else
1897 {
1898 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
1899 }
1900 #else
1901 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
1902 #endif
1903 return VBROADCAST(pId);
1904 }
1905 case StoreInstanceId:
1906 {
1907 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
1908 return VBROADCAST(pId);
1909 }
1910 case StoreSrc:
1911 default: SWR_INVALID("Invalid component control"); return VUNDEF_I();
1912 }
1913 }
1914
1915 //////////////////////////////////////////////////////////////////////////
1916 /// @brief Returns the enable mask for the specified component.
1917 /// @param enableMask - enable bits
1918 /// @param component - component to check if enabled.
1919 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
1920 {
1921 switch (component)
1922 {
1923 // X
1924 case 0: return (enableMask & ComponentEnable::X);
1925 // Y
1926 case 1: return (enableMask & ComponentEnable::Y);
1927 // Z
1928 case 2: return (enableMask & ComponentEnable::Z);
1929 // W
1930 case 3: return (enableMask & ComponentEnable::W);
1931
1932 default: return false;
1933 }
1934 }
1935
1936
1937 //////////////////////////////////////////////////////////////////////////
1938 /// @brief JITs the fetch shader IR into an executable fetch function
1939 /// @param hJitMgr - JitManager handle
1940 /// @param hFunc - handle to the LLVM fetch function IR
1941 /// @return PFN_FETCH_FUNC - pointer to fetch code
1942 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
1943 {
1944 const llvm::Function* func = (const llvm::Function*)hFunc;
1945 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1946 PFN_FETCH_FUNC pfnFetch;
1947
1948 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
1949 // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
1950 pJitMgr->mIsModuleFinalized = true;
1951
1952 #if defined(KNOB_SWRC_TRACING)
1953 char fName[1024];
1954 const char *funcName = func->getName().data();
1955 sprintf(fName, "%s.bin", funcName);
1956 FILE *fd = fopen(fName, "wb");
1957 fwrite((void *)pfnFetch, 1, 2048, fd);
1958 fclose(fd);
1959 #endif
1960
1961 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
1962
1963 return pfnFetch;
1964 }
1965
1966 //////////////////////////////////////////////////////////////////////////
1967 /// @brief JIT compiles fetch shader
1968 /// @param hJitMgr - JitManager handle
1969 /// @param state - fetch state to build function from
1970 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
1971 {
1972 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1973
1974 pJitMgr->SetupNewModule();
1975
1976 FetchJit theJit(pJitMgr);
1977 HANDLE hFunc = theJit.Create(state);
1978
1979 return JitFetchFunc(hJitMgr, hFunc);
1980 }
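
// Illustrative usage sketch (comment only, not part of the driver): a caller builds a
// FETCH_COMPILE_STATE describing the vertex layout, compiles it once, and caches the
// returned function pointer for later draws, e.g.:
//
//     FETCH_COMPILE_STATE state = {};    // populated from the vertex element layout
//     PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, state);
//     // pfnFetch is then invoked per fetch with a populated SWR_FETCH_CONTEXT and an
//     // output simdvertex buffer.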