mesa.git: src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_api.h"
31 #include "fetch_jit.h"
32 #include "builder.h"
33 #include "state_llvm.h"
34 #include <sstream>
35 #include <tuple>
36
37 //#define FETCH_DUMP_VERTEX 1
38
39 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
40
41 enum ConversionType
42 {
43 CONVERT_NONE,
44 CONVERT_NORMALIZED,
45 CONVERT_USCALED,
46 CONVERT_SSCALED,
47 };
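// CONVERT_NORMALIZED rescales the raw integer into [0,1] (UNORM) or [-1,1] (SNORM),
// e.g. by 1/255 or 1/127 for 8bpc and 1/32767 for 16bpc in the shuffle routines below.
// CONVERT_USCALED / CONVERT_SSCALED convert the integer to float without rescaling.
// CONVERT_NONE stores the gathered bits unchanged.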
48
49 //////////////////////////////////////////////////////////////////////////
50 /// Interface to Jitting a fetch shader
51 //////////////////////////////////////////////////////////////////////////
52 struct FetchJit : public Builder
53 {
54 FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
55
56 Function* Create(const FETCH_COMPILE_STATE& fetchState);
57 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
58 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
59 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
60
61 // package up Shuffle*bpcGatherd args into a tuple for convenience
62 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
63 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
64 const uint32_t(&)[4], Value*, bool, uint32_t, bool, uint32_t> Shuffle8bpcArgs;
65 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
66
67 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
68 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
69 Value*, bool, uint32_t, bool, uint32_t> Shuffle16bpcArgs;
70 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
71
72 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
73
74 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
75
76 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut);
77 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut);
78
79 bool IsOddFormat(SWR_FORMAT format);
80 bool IsUniformFormat(SWR_FORMAT format);
81 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
82 void CreateGatherOddFormats(SWR_FORMAT format, Value* pBase, Value* offsets, Value* result[4]);
83 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
84
85 };
86
87 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
88 {
89 static std::size_t fetchNum = 0;
90
91 std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
92 fnName << fetchNum++;
93
94 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
95 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
96
97 IRB()->SetInsertPoint(entry);
98
99 auto argitr = fetch->getArgumentList().begin();
100
101 // Fetch shader arguments
102 Value* fetchInfo = &*argitr; ++argitr;
103 fetchInfo->setName("fetchInfo");
104 Value* pVtxOut = &*argitr;
105 pVtxOut->setName("vtxOutput");
106 // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex
107 // index 0 (just the pointer to the simdvertex structure)
108 // index 1 (which element of the simdvertex structure to offset to, in this case 0)
109 // so the indices being i32's doesn't matter
110 // TODO: generate this GEP with a VECTOR structure type so this makes sense
111 std::vector<Value*> vtxInputIndices(2, C(0));
112 // GEP
113 pVtxOut = GEP(pVtxOut, C(0));
114 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
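// e.g. with an 8-wide SIMD the two statements above lower to IR along these
// lines (illustrative only; the exact struct type depends on the simdvertex layout):
//   %base = getelementptr %simdvertex, %simdvertex* %vtxOutput, i32 0
//   %vtx  = bitcast %simdvertex* %base to <8 x float>*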
115
116 // SWR_FETCH_CONTEXT::pStreams
117 Value* streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
118 streams->setName("pStreams");
119
120 // SWR_FETCH_CONTEXT::pIndices
121 Value* indices = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
122 indices->setName("pIndices");
123
124 // SWR_FETCH_CONTEXT::pLastIndex
125 Value* pLastIndex = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
126 pLastIndex->setName("pLastIndex");
127
128
129 Value* vIndices;
130 switch(fetchState.indexType)
131 {
132 case R8_UINT:
133 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
134 if(fetchState.bDisableIndexOOBCheck){
135 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
136 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
137 }
138 else{
139 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
140 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
141 }
142 break;
143 case R16_UINT:
144 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
145 if(fetchState.bDisableIndexOOBCheck){
146 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
147 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
148 }
149 else{
150 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
151 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
152 }
153 break;
154 case R32_UINT:
155 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
156 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
157 break; // incoming type is already 32bit int
158 default: SWR_ASSERT(0, "Unsupported index type"); vIndices = nullptr; break;
159 }
160
161 // store out vertex IDs
162 STORE(vIndices, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
163
164 // store out cut mask if enabled
165 if (fetchState.bEnableCutIndex)
166 {
167 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
168 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
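// per lane: cutMask[i] = (vIndices[i] == cutIndex) ? ~0 : 0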
169 STORE(cutMask, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
170 }
171
172 // Fetch attributes from memory and output to a simdvertex struct
173 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
174 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut)
175 : JitGatherVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut);
176
177 RET_VOID();
178
179 JitManager::DumpToFile(fetch, "src");
180
181 verifyFunction(*fetch);
182
183 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
184
185 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
186 setupPasses.add(createBreakCriticalEdgesPass());
187 setupPasses.add(createCFGSimplificationPass());
188 setupPasses.add(createEarlyCSEPass());
189 setupPasses.add(createPromoteMemoryToRegisterPass());
190
191 setupPasses.run(*fetch);
192
193 JitManager::DumpToFile(fetch, "se");
194
195 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
196
197 ///@todo Haven't touched these either. Need to remove some of these and add others.
198 optPasses.add(createCFGSimplificationPass());
199 optPasses.add(createEarlyCSEPass());
200 optPasses.add(createInstructionCombiningPass());
201 optPasses.add(createInstructionSimplifierPass());
202 optPasses.add(createConstantPropagationPass());
203 optPasses.add(createSCCPPass());
204 optPasses.add(createAggressiveDCEPass());
205
206 optPasses.run(*fetch);
207 optPasses.run(*fetch);
208
209 JitManager::DumpToFile(fetch, "opt");
210
211 return fetch;
212 }
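// Illustrative sketch (not part of the original fetch JIT): the pass setup
// above, expressed against LLVM's legacy pass manager directly. Header and
// factory-function locations vary between LLVM releases, so treat this as an
// assumption-laden outline rather than a drop-in equivalent.
//
//   #include "llvm/IR/LegacyPassManager.h"
//   #include "llvm/Transforms/Scalar.h"
//
//   void RunFetchPasses(llvm::Module* pModule, llvm::Function* pFunc)
//   {
//       llvm::legacy::FunctionPassManager fpm(pModule);
//       fpm.add(llvm::createEarlyCSEPass());
//       fpm.add(llvm::createPromoteMemoryToRegisterPass());
//       fpm.add(llvm::createCFGSimplificationPass());
//       fpm.doInitialization();
//       fpm.run(*pFunc);
//       fpm.doFinalization();
//   }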
213
214 //////////////////////////////////////////////////////////////////////////
215 /// @brief Loads attributes from memory using LOADs, shuffling the
216 /// components into SOA form.
217 /// *Note* currently does not support component control,
218 /// component packing, instancing, InstanceID SGVs, or VertexID SGVs
219 /// @param fetchState - info about attributes to be fetched from memory
220 /// @param streams - value pointer to the current vertex stream
221 /// @param vIndices - vector value of indices to load
222 /// @param pVtxOut - value pointer to output simdvertex struct
223 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut)
224 {
225 // Zack shuffles; a variant of the Charleston.
226
227 std::vector<Value*> vectors(16);
228 std::vector<Constant*> pMask(mVWidth);
229 for(uint32_t i = 0; i < mVWidth; ++i)
230 {
231 pMask[i] = (C(i < 4 ? i : 4));
232 }
233 Constant* promoteMask = ConstantVector::get(pMask);
234 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
235
236 Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
237 Value* startInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
238 Value* curInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
239 Value* vBaseVertex = VBROADCAST(LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
240 curInstance->setName("curInstance");
241
242 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
243 {
244 Value* elements[4] = {0};
245 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
246 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
247 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
248 uint32_t numComponents = info.numComps;
249 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
250
251 vectors.clear();
252
253 Value *vCurIndices;
254 Value *startOffset;
255 if(ied.InstanceEnable)
256 {
257 Value* stepRate = C(ied.InstanceDataStepRate);
258
259 // prevent a div by 0 for 0 step rate
260 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
261 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
262
263 // calc the current offset into instanced data buffer
264 Value* calcInstance = UDIV(curInstance, stepRate);
265
266 // if step rate is 0, every instance gets instance 0
267 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
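// in scalar terms: instanceIndex = (stepRate != 0) ? (curInstance / stepRate) : 0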
268
269 vCurIndices = VBROADCAST(calcInstance);
270
271 startOffset = startInstance;
272 }
273 else
274 {
275 // offset indices by baseVertex
276 vCurIndices = ADD(vIndices, vBaseVertex);
277
278 startOffset = startVertex;
279 }
280
281 // load SWR_VERTEX_BUFFER_STATE::pData
282 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
283
284 // load SWR_VERTEX_BUFFER_STATE::pitch
285 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
286 stride = Z_EXT(stride, mInt64Ty);
287
288 // load SWR_VERTEX_BUFFER_STATE::size
289 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
290 size = Z_EXT(size, mInt64Ty);
291
292 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
293
294 // Load from the stream.
295 for(uint32_t lane = 0; lane < mVWidth; ++lane)
296 {
297 // Get index
298 Value* index = VEXTRACT(vCurIndices, C(lane));
299 index = Z_EXT(index, mInt64Ty);
300
301 Value* offset = MUL(index, stride);
302 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
303 offset = ADD(offset, startVertexOffset);
304
305 if (!fetchState.bDisableIndexOOBCheck) {
306 // check for out of bound access, including partial OOB, and mask them to 0
307 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
308 Value *oob = ICMP_ULE(endOffset, size);
309 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
310 }
311
312 Value* pointer = GEP(stream, offset);
313 // We always load a full 4-component lane here; unused components are simply ignored.
314 Value* vptr = 0;
315
316 // get a pointer to a 4 component attrib in default address space
317 switch(bpc)
318 {
319 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
320 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
321 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
322 default: SWR_ASSERT(false, "Unsupported underlying bpp!");
323 }
324
325 // load 4 components of attribute
326 Value* vec = ALIGNED_LOAD(vptr, 1, false);
327
328 // Convert To FP32 internally
329 switch(info.type[0])
330 {
331 case SWR_TYPE_UNORM:
332 switch(bpc)
333 {
334 case 8:
335 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
336 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
337 break;
338 case 16:
339 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
340 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
341 break;
342 default:
343 SWR_ASSERT(false, "Unsupported underlying type!");
344 break;
345 }
346 break;
347 case SWR_TYPE_SNORM:
348 switch(bpc)
349 {
350 case 8:
351 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
352 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
353 break;
354 case 16:
355 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
356 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
357 break;
358 default:
359 SWR_ASSERT(false, "Unsupported underlying type!");
360 break;
361 }
362 break;
363 case SWR_TYPE_UINT:
364 // Zero extend uint32_t types.
365 switch(bpc)
366 {
367 case 8:
368 case 16:
369 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
370 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
371 break;
372 case 32:
373 break; // Pass through unchanged.
374 default:
375 SWR_ASSERT(false, "Unsupported underlying type!");
376 break;
377 }
378 break;
379 case SWR_TYPE_SINT:
380 // Sign extend SINT types.
381 switch(bpc)
382 {
383 case 8:
384 case 16:
385 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
386 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
387 break;
388 case 32:
389 break; // Pass through unchanged.
390 default:
391 SWR_ASSERT(false, "Unsupported underlying type!");
392 break;
393 }
394 break;
395 case SWR_TYPE_FLOAT:
396 switch(bpc)
397 {
398 case 32:
399 break; // Pass through unchanged.
400 default:
401 SWR_ASSERT(false, "Unsupported underlying type!");
402 }
403 break;
404 case SWR_TYPE_USCALED:
405 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
406 break;
407 case SWR_TYPE_SSCALED:
408 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
409 break;
410 case SWR_TYPE_UNKNOWN:
411 case SWR_TYPE_UNUSED:
412 SWR_ASSERT(false, "Unsupported type %d!", info.type[0]);
413 }
414
415 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
416 // uwvec: 4 x F32, undef value
417 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
418 vectors.push_back(wvec);
419 }
420
421 std::vector<Constant*> v01Mask(mVWidth);
422 std::vector<Constant*> v23Mask(mVWidth);
423 std::vector<Constant*> v02Mask(mVWidth);
424 std::vector<Constant*> v13Mask(mVWidth);
425
426 // Concatenate the vectors together.
427 elements[0] = VUNDEF_F();
428 elements[1] = VUNDEF_F();
429 elements[2] = VUNDEF_F();
430 elements[3] = VUNDEF_F();
431 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
432 {
433 v01Mask[4 * b + 0] = C(0 + 4 * b);
434 v01Mask[4 * b + 1] = C(1 + 4 * b);
435 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
436 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
437
438 v23Mask[4 * b + 0] = C(2 + 4 * b);
439 v23Mask[4 * b + 1] = C(3 + 4 * b);
440 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
441 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
442
443 v02Mask[4 * b + 0] = C(0 + 4 * b);
444 v02Mask[4 * b + 1] = C(2 + 4 * b);
445 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
446 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
447
448 v13Mask[4 * b + 0] = C(1 + 4 * b);
449 v13Mask[4 * b + 1] = C(3 + 4 * b);
450 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
451 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
452
453 std::vector<Constant*> iMask(mVWidth);
454 for(uint32_t i = 0; i < mVWidth; ++i)
455 {
456 if(((4 * b) <= i) && (i < (4 * (b + 1))))
457 {
458 iMask[i] = C(i % 4 + mVWidth);
459 }
460 else
461 {
462 iMask[i] = C(i);
463 }
464 }
465 Constant* insertMask = ConstantVector::get(iMask);
466 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
467 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
468 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
469 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
470 }
471
472 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
473 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
474 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
475 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
476 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
477 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
478 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
479 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
480
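// write default values into any components the format does not supply;
// intentional case fallthrough fills the remaining elements with (0, 0, 0, 1)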
481 switch(numComponents + 1)
482 {
483 case 1: elements[0] = VIMMED1(0.0f);
484 case 2: elements[1] = VIMMED1(0.0f);
485 case 3: elements[2] = VIMMED1(0.0f);
486 case 4: elements[3] = VIMMED1(1.0f);
487 }
488
489 for(uint32_t c = 0; c < 4; ++c)
490 {
491 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
492 STORE(elements[c], dest);
493 }
494 }
495 }
496
497 // returns true for odd formats that require special gather handling
498 bool FetchJit::IsOddFormat(SWR_FORMAT format)
499 {
500 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
501 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32)
502 {
503 return true;
504 }
505 return false;
506 }
507
508 // format is uniform if all components are the same size and type
509 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
510 {
511 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
512 uint32_t bpc0 = info.bpc[0];
513 uint32_t type0 = info.type[0];
514
515 for (uint32_t c = 1; c < info.numComps; ++c)
516 {
517 if (bpc0 != info.bpc[c] || type0 != info.type[c])
518 {
519 return false;
520 }
521 }
522 return true;
523 }
524
525 // unpacks components based on format
526 // foreach component in the pixel
527 // mask off everything but this component
528 // shift component to LSB
529 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
530 {
531 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
532
533 uint32_t bitOffset = 0;
534 for (uint32_t c = 0; c < info.numComps; ++c)
535 {
536 uint32_t swizzledIndex = info.swizzle[c];
537 uint32_t compBits = info.bpc[c];
538 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
539 Value* comp = AND(vInput, bitmask);
540 comp = LSHR(comp, bitOffset);
541
542 result[swizzledIndex] = comp;
543 bitOffset += compBits;
544 }
545 }
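// Illustrative scalar sketch (not part of the original fetch JIT; the helper
// name is hypothetical): the same mask-and-shift unpack UnpackComponents emits
// above, for a packed pixel whose per-component bit widths are in bpc[]
// (destination swizzle omitted, bit widths assumed < 32).
static inline void UnpackComponentsScalarRef(uint32_t pixel, const uint32_t bpc[4],
                                             uint32_t numComps, uint32_t result[4])
{
    uint32_t bitOffset = 0;
    for (uint32_t c = 0; c < numComps; ++c)
    {
        uint32_t mask = ((1u << bpc[c]) - 1) << bitOffset; // isolate this component's bits
        result[c] = (pixel & mask) >> bitOffset;           // shift the component down to the LSB
        bitOffset += bpc[c];
    }
}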
546
547 // gather for odd component size formats
548 // gather SIMD full pixels per lane then shift/mask to move each component to its
549 // own vector
550 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pBase, Value* offsets, Value* result[4])
551 {
552 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
553
554 // only works if pixel size is <= 32bits
555 SWR_ASSERT(info.bpp <= 32);
556
557 Value* gather = VUNDEF_I();
558
559 // assign defaults
560 for (uint32_t comp = 0; comp < 4; ++comp)
561 {
562 result[comp] = VIMMED1((int)info.defaults[comp]);
563 }
564
565 // gather SIMD pixels
566 for (uint32_t e = 0; e < JM()->mVWidth; ++e)
567 {
568 Value* elemOffset = VEXTRACT(offsets, C(e));
569 Value* load = GEP(pBase, elemOffset);
570
571 // load the proper amount of data based on component size
572 switch (info.bpp)
573 {
574 case 8: load = POINTER_CAST(load, Type::getInt8PtrTy(JM()->mContext)); break;
575 case 16: load = POINTER_CAST(load, Type::getInt16PtrTy(JM()->mContext)); break;
576 case 32: load = POINTER_CAST(load, Type::getInt32PtrTy(JM()->mContext)); break;
577 default: SWR_ASSERT(0);
578 }
579
580 // load pixel
581 Value *val = LOAD(load);
582
583 // zero extend to 32bit integer
584 val = INT_CAST(val, mInt32Ty, false);
585
586 // store in simd lane
587 gather = VINSERT(gather, val, C(e));
588 }
589
590 UnpackComponents(format, gather, result);
591
592 // cast to fp32
593 result[0] = BITCAST(result[0], mSimdFP32Ty);
594 result[1] = BITCAST(result[1], mSimdFP32Ty);
595 result[2] = BITCAST(result[2], mSimdFP32Ty);
596 result[3] = BITCAST(result[3], mSimdFP32Ty);
597 }
598
599 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
600 {
601 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
602
603 for (uint32_t c = 0; c < info.numComps; ++c)
604 {
605 uint32_t compIndex = info.swizzle[c];
606
607 // skip any conversion on UNUSED components
608 if (info.type[c] == SWR_TYPE_UNUSED)
609 {
610 continue;
611 }
612
613 if (info.isNormalized[c])
614 {
615 if (info.type[c] == SWR_TYPE_SNORM)
616 {
617 /// @todo The most-negative value should clamp to -1.0f, e.g. the 5-bit value 10000b should map to -1.0f.
618
619 /// result = c * (1.0f / (2^(n-1) - 1))
620 uint32_t n = info.bpc[c];
621 uint32_t pow2 = 1 << (n - 1);
622 float scale = 1.0f / (float)(pow2 - 1);
623 Value *vScale = VIMMED1(scale);
624 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
625 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
626 texels[compIndex] = FMUL(texels[compIndex], vScale);
627 }
628 else
629 {
630 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
631
632 /// result = c * (1.0f / (2^n - 1))
633 uint32_t n = info.bpc[c];
634 uint32_t pow2 = 1 << n;
635 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
636 if (n == 24)
637 {
638 float scale = (float)(pow2 - 1);
639 Value* vScale = VIMMED1(scale);
640 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
641 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
642 texels[compIndex] = FDIV(texels[compIndex], vScale);
643 }
644 else
645 {
646 float scale = 1.0f / (float)(pow2 - 1);
647 Value *vScale = VIMMED1(scale);
648 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
649 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
650 texels[compIndex] = FMUL(texels[compIndex], vScale);
651 }
652 }
653 continue;
654 }
655 }
656 }
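// Illustrative scalar sketch (not part of the original fetch JIT; the helper
// name is hypothetical): the per-component normalization ConvertFormat emits
// above, assuming the raw value is already sign/zero extended to 32 bits and
// bpc < 32 (the 24bit full-divide special case is omitted).
static inline float NormalizeComponentRef(int32_t raw, uint32_t bpc, bool isSnorm)
{
    if (isSnorm)
    {
        // SNORM: result = c * (1.0f / (2^(n-1) - 1))
        return (float)raw * (1.0f / (float)((1u << (bpc - 1)) - 1));
    }
    // UNORM: result = c * (1.0f / (2^n - 1))
    return (float)(uint32_t)raw * (1.0f / (float)((1u << bpc) - 1));
}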
657
658 //////////////////////////////////////////////////////////////////////////
659 /// @brief Loads attributes from memory using AVX2 GATHER(s)
660 /// @param fetchState - info about attributes to be fetched from memory
661 /// @param fetchInfo - first argument passed to fetch shader
662 /// @param streams - value pointer to the current vertex stream
663 /// @param vIndices - vector value of indices to gather
664 /// @param pVtxOut - value pointer to output simdvertex struct
665 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo,
666 Value* streams, Value* vIndices, Value* pVtxOut)
667 {
668 uint32_t currentVertexElement = 0;
669 uint32_t outputElt = 0;
670 Value* vVertexElements[4];
671
672 Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
673 Value* startInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
674 Value* curInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
675 Value* vBaseVertex = VBROADCAST(LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
676 curInstance->setName("curInstance");
677
678 for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
679 {
680 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
681 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
682 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
683 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
684
685 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
686
687 // VGATHER* takes an *i8 src pointer
688 Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
689
690 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
691 Value *vStride = VBROADCAST(stride);
692
693 // max vertex index that is fully in bounds
694 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
695 maxVertex = LOAD(maxVertex);
696
697 Value *vCurIndices;
698 Value *startOffset;
699 if(ied.InstanceEnable)
700 {
701 Value* stepRate = C(ied.InstanceDataStepRate);
702
703 // prevent a div by 0 for 0 step rate
704 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
705 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
706
707 // calc the current offset into instanced data buffer
708 Value* calcInstance = UDIV(curInstance, stepRate);
709
710 // if step rate is 0, every instance gets instance 0
711 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
712
713 vCurIndices = VBROADCAST(calcInstance);
714
715 startOffset = startInstance;
716 }
717 else
718 {
719 // offset indices by baseVertex
720 vCurIndices = ADD(vIndices, vBaseVertex);
721
722 startOffset = startVertex;
723 }
724
725 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
726 // do 64bit address offset calculations.
727
728 // calculate byte offset to the start of the VB
729 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
730 pStreamBase = GEP(pStreamBase, baseOffset);
731
732 // if we have a start offset, subtract from max vertex. Used for OOB check
733 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
734 Value* neg = ICMP_SLT(maxVertex, C((int64_t)0));
735 // if we have a negative value, we're already OOB. clamp at 0.
736 maxVertex = SELECT(neg, C(0), TRUNC(maxVertex, mInt32Ty));
737
738 // Load the in bounds size of a partially valid vertex
739 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
740 partialInboundsSize = LOAD(partialInboundsSize);
741 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
742 Value* vBpp = VBROADCAST(C(info.Bpp));
743 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
744
745 // is the element <= the partially valid size
746 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
747
748 // are vertices partially OOB?
749 Value* vMaxVertex = VBROADCAST(maxVertex);
750 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
751
752 // are vertices fully in bounds?
753 Value* vGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
754
755 // blend in any partially OOB indices that have valid elements
756 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
757 vGatherMask = VMASK(vGatherMask);
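// per lane: gather when (index < maxVertex) ||
//           ((index == maxVertex) && (AlignedByteOffset + Bpp <= partialInboundsSize))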
758
759 // calculate the actual offsets into the VB
760 Value* vOffsets = MUL(vCurIndices, vStride);
761 vOffsets = ADD(vOffsets, vAlignmentOffsets);
762
763 // Packing and component control
764 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
765 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
766 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
767
768 // Special gather/conversion for formats without equal component sizes
769 if (IsOddFormat((SWR_FORMAT)ied.Format))
770 {
771 // Only full 4 component fetch is supported for odd formats
772 SWR_ASSERT(compMask == XYZW);
773 Value* pResults[4];
774 CreateGatherOddFormats((SWR_FORMAT)ied.Format, pStreamBase, vOffsets, pResults);
775 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
776
777 // check for InstanceID SGV
778 if (fetchState.InstanceIdEnable && (fetchState.InstanceIdElementOffset == nInputElt))
779 {
780 SWR_ASSERT(fetchState.InstanceIdComponentNumber < (sizeof(pResults) / sizeof(pResults[0])));
781
782 // Load a SIMD of InstanceIDs
783 pResults[fetchState.InstanceIdComponentNumber] = VBROADCAST(LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance }))); // InstanceID
784 }
785 // check for VertexID SGV
786 else if (fetchState.VertexIdEnable && (fetchState.VertexIdElementOffset == nInputElt))
787 {
788 SWR_ASSERT(fetchState.VertexIdComponentNumber < (sizeof(pResults) / sizeof(pResults[0])));
789
790 // Load a SIMD of VertexIDs
791 pResults[fetchState.VertexIdComponentNumber] = LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
792 }
793
794 StoreVertexElements(pVtxOut, outputElt++, 4, pResults);
795 currentVertexElement = 0;
796 }
797 else if(info.type[0] == SWR_TYPE_FLOAT)
798 {
799 ///@todo: support 64 bit vb accesses
800 Value* gatherSrc = VIMMED1(0.0f);
801
802 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
803 "Unsupported format for standard gather fetch.");
804
805 // Gather components from memory to store in a simdvertex structure
806 switch(bpc)
807 {
808 case 16:
809 {
810 Value* vGatherResult[2];
811 Value *vMask;
812
813 // if we have at least one component out of x or y to fetch
814 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
815 // save mask as it is zero'd out after each gather
816 vMask = vGatherMask;
817
818 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
819 // e.g. result of first 8x32bit integer gather for 16bit components
820 // 256i - 0 1 2 3 4 5 6 7
821 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
822 //
823 }
824
825 // if we have at least one component out of z or w to fetch
826 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
827 // offset base to the next components(zw) in the vertex to gather
828 pStreamBase = GEP(pStreamBase, C((char)4));
829 vMask = vGatherMask;
830
831 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
832 // e.g. result of second 8x32bit integer gather for 16bit components
833 // 256i - 0 1 2 3 4 5 6 7
834 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
835 //
836 }
837
838 // if we have at least one component to shuffle into place
839 if(compMask){
840 const bool instanceIdEnable = (fetchState.InstanceIdEnable) && (fetchState.InstanceIdElementOffset == nInputElt);
841 const bool vertexIdEnable = (fetchState.VertexIdEnable) && (fetchState.VertexIdElementOffset == nInputElt);
842
843 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
844 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, fetchInfo, instanceIdEnable,
845 fetchState.InstanceIdComponentNumber, vertexIdEnable, fetchState.VertexIdComponentNumber);
846
847 // Shuffle gathered components into place in simdvertex struct
848 Shuffle16bpcGather(args); // outputs to vVertexElements ref
849 }
850 }
851 break;
852 case 32:
853 {
854 for (uint32_t i = 0; i < 4; i++)
855 {
856 if (isComponentEnabled(compMask, i))
857 {
858 // check for InstanceID SGV
859 if ((fetchState.InstanceIdEnable) && (fetchState.InstanceIdElementOffset == nInputElt) && (fetchState.InstanceIdComponentNumber == currentVertexElement))
860 {
861 // Load a SIMD of InstanceIDs
862 vVertexElements[currentVertexElement++] = VBROADCAST(LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance }))); // InstanceID
863 }
864 // check for VertexID SGV
865 else if ((fetchState.VertexIdEnable) && (fetchState.VertexIdElementOffset == nInputElt) && (fetchState.VertexIdComponentNumber == currentVertexElement))
866 {
867 // Load a SIMD of VertexIDs
868 vVertexElements[currentVertexElement++] = LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
869 }
870 // if we need to gather the component
871 else if (compCtrl[i] == StoreSrc)
872 {
873 // save mask as it is zero'd out after each gather
874 Value *vMask = vGatherMask;
875
876 // Gather a SIMD of vertices
877 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
878 }
879 else
880 {
881 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
882 }
883
884 if (currentVertexElement > 3)
885 {
886 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
887 // reset to the next vVertexElement to output
888 currentVertexElement = 0;
889 }
890
891 }
892
893 // offset base to the next component in the vertex to gather
894 pStreamBase = GEP(pStreamBase, C((char)4));
895 }
896 }
897 break;
898 default:
899 SWR_ASSERT(0, "Tried to fetch invalid FP format");
900 break;
901 }
902 }
903 else
904 {
905 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
906 ConversionType conversionType = CONVERT_NONE;
907
908 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
909 "Unsupported format for standard gather fetch.");
910
911 switch(info.type[0])
912 {
913 case SWR_TYPE_UNORM:
914 conversionType = CONVERT_NORMALIZED;
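// fall through: UNORM shares UINT's zero extend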
915 case SWR_TYPE_UINT:
916 extendCastType = Instruction::CastOps::ZExt;
917 break;
918 case SWR_TYPE_SNORM:
919 conversionType = CONVERT_NORMALIZED;
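// fall through: SNORM shares SINT's sign extend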
920 case SWR_TYPE_SINT:
921 extendCastType = Instruction::CastOps::SExt;
922 break;
923 case SWR_TYPE_USCALED:
924 conversionType = CONVERT_USCALED;
925 extendCastType = Instruction::CastOps::UIToFP;
926 break;
927 case SWR_TYPE_SSCALED:
928 conversionType = CONVERT_SSCALED;
929 extendCastType = Instruction::CastOps::SIToFP;
930 break;
931 default:
932 break;
933 }
934
935 // value substituted when component of gather is masked
936 Value* gatherSrc = VIMMED1(0);
937
938 // Gather components from memory to store in a simdvertex structure
939 switch (bpc)
940 {
941 case 8:
942 {
943 // if we have at least one component to fetch
944 if(compMask)
945 {
946 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
947 // e.g. result of an 8x32bit integer gather for 8bit components
948 // 256i - 0 1 2 3 4 5 6 7
949 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
950
951 const bool instanceIdEnable = fetchState.InstanceIdEnable && (fetchState.InstanceIdElementOffset == nInputElt);
952 const bool vertexIdEnable = fetchState.VertexIdEnable && (fetchState.VertexIdElementOffset == nInputElt);
953
954 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
955 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle, fetchInfo,
956 instanceIdEnable, fetchState.InstanceIdComponentNumber, vertexIdEnable, fetchState.VertexIdComponentNumber);
957
958 // Shuffle gathered components into place in simdvertex struct
959 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
960 }
961 }
962 break;
963 case 16:
964 {
965 Value* vGatherResult[2];
966 Value *vMask;
967
968 // if we have at least one component out of x or y to fetch
969 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
970 // save mask as it is zero'd out after each gather
971 vMask = vGatherMask;
972
973 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
974 // e.g. result of first 8x32bit integer gather for 16bit components
975 // 256i - 0 1 2 3 4 5 6 7
976 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
977 //
978 }
979
980 // if we have at least one component out of z or w to fetch
981 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
982 // offset base to the next components(zw) in the vertex to gather
983 pStreamBase = GEP(pStreamBase, C((char)4));
984 vMask = vGatherMask;
985
986 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
987 // e.g. result of second 8x32bit integer gather for 16bit components
988 // 256i - 0 1 2 3 4 5 6 7
989 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
990 //
991 }
992
993 // if we have at least one component to shuffle into place
994 if(compMask){
995 const bool instanceIdEnable = fetchState.InstanceIdEnable && (fetchState.InstanceIdElementOffset == nInputElt);
996 const bool vertexIdEnable = fetchState.VertexIdEnable && (fetchState.VertexIdElementOffset == nInputElt);
997
998 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
999 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, fetchInfo, instanceIdEnable,
1000 fetchState.InstanceIdComponentNumber, vertexIdEnable, fetchState.VertexIdComponentNumber);
1001
1002 // Shuffle gathered components into place in simdvertex struct
1003 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1004 }
1005 }
1006 break;
1007 case 32:
1008 {
1009 SWR_ASSERT(conversionType == CONVERT_NONE);
1010
1011 // Gathered components into place in simdvertex struct
1012 for (uint32_t i = 0; i < 4; i++)
1013 {
1014 if (isComponentEnabled(compMask, i))
1015 {
1016 // check for InstanceID SGV
1017 if (fetchState.InstanceIdEnable && (fetchState.InstanceIdElementOffset == nInputElt) && (fetchState.InstanceIdComponentNumber == currentVertexElement))
1018 {
1019 // Load a SIMD of InstanceIDs
1020 vVertexElements[currentVertexElement++] = VBROADCAST(LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance }))); // InstanceID
1021 }
1022 // check for VertexID SGV
1023 else if (fetchState.VertexIdEnable && (fetchState.VertexIdElementOffset == nInputElt) && (fetchState.VertexIdComponentNumber == currentVertexElement))
1024 {
1025 // Load a SIMD of VertexIDs
1026 vVertexElements[currentVertexElement++] = LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
1027 }
1028 // if we need to gather the component
1029 else if (compCtrl[i] == StoreSrc)
1030 {
1031 // save mask as it is zero'd out after each gather
1032 Value *vMask = vGatherMask;
1033
1034 vVertexElements[currentVertexElement++] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
1035
1036 // e.g. result of a single 8x32bit integer gather for 32bit components
1037 // 256i - 0 1 2 3 4 5 6 7
1038 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1039 }
1040 else
1041 {
1042 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1043 }
1044
1045 if (currentVertexElement > 3)
1046 {
1047 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1048 // reset to the next vVertexElement to output
1049 currentVertexElement = 0;
1050 }
1051
1052 }
1053
1054 // offset base to the next component in the vertex to gather
1055 pStreamBase = GEP(pStreamBase, C((char)4));
1056 }
1057 }
1058 break;
1059 }
1060 }
1061 }
1062
1063 // if we have a partially filled vVertexElement struct, output it
1064 if(currentVertexElement > 0){
1065 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1066 }
1067 }
1068
1069 //////////////////////////////////////////////////////////////////////////
1070 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1071 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1072 /// support
1073 /// @param pIndices - pointer to 8 bit indices
1074 /// @param pLastIndex - pointer to last valid index
1075 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1076 {
1077 // can fit 4 8 bit integers per vWidth lane
1078 Value* vIndices = VUNDEF_I();
1079
1080 // store 0 index on stack to be used to conditionally load from if index address is OOB
1081 Value* pZeroIndex = ALLOCA(mInt8Ty);
1082 STORE(C((uint8_t)0), pZeroIndex);
1083
1084 // Load a SIMD of index pointers
1085 for(int64_t lane = 0; lane < mVWidth; lane++)
1086 {
1087 // Calculate the address of the requested index
1088 Value *pIndex = GEP(pIndices, C(lane));
1089
1090 // check if the address is less than the max index,
1091 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1092
1093 // if valid, load the index. if not, load 0 from the stack
1094 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1095 Value *index = LOAD(pValid, "valid index");
1096
1097 // zero extend the index to 32 bits and insert it into the correct simd lane
1098 index = Z_EXT(index, mInt32Ty);
1099 vIndices = VINSERT(vIndices, index, lane);
1100 }
1101 return vIndices;
1102 }
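// Illustrative scalar sketch (not part of the original fetch JIT; the helper
// name is hypothetical): the per-lane bounds check performed above; lanes
// whose index address is out of bounds read an index of 0 instead.
static inline uint32_t LoadValidIndexRef(const uint8_t* pIndex, const uint8_t* pLastIndex)
{
    return (pIndex < pLastIndex) ? (uint32_t)(*pIndex) : 0u;
}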
1103
1104 //////////////////////////////////////////////////////////////////////////
1105 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1106 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1107 /// support
1108 /// @param pIndices - pointer to 16 bit indices
1109 /// @param pLastIndex - pointer to last valid index
1110 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1111 {
1112 // can fit 2 16 bit integers per vWidth lane
1113 Value* vIndices = VUNDEF_I();
1114
1115 // store 0 index on stack to be used to conditionally load from if index address is OOB
1116 Value* pZeroIndex = ALLOCA(mInt16Ty);
1117 STORE(C((uint16_t)0), pZeroIndex);
1118
1119 // Load a SIMD of index pointers
1120 for(int64_t lane = 0; lane < mVWidth; lane++)
1121 {
1122 // Calculate the address of the requested index
1123 Value *pIndex = GEP(pIndices, C(lane));
1124
1125 // check if the address is less than the max index,
1126 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1127
1128 // if valid, load the index. if not, load 0 from the stack
1129 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1130 Value *index = LOAD(pValid, "valid index");
1131
1132 // zero extend the index to 32 bits and insert it into the correct simd lane
1133 index = Z_EXT(index, mInt32Ty);
1134 vIndices = VINSERT(vIndices, index, lane);
1135 }
1136 return vIndices;
1137 }
1138
1139 //////////////////////////////////////////////////////////////////////////
1140 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1141 /// @param pIndices - pointer to 32 bit indices
1142 /// @param pLastIndex - pointer to last valid index
1143 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1144 {
1145 DataLayout dL(JM()->mpCurrentModule);
1146 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1147 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1148 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1149
1150 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1151 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1152 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1153 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1154
1155 // create a vector of index counts from the base index ptr passed into the fetch
1156 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1157 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1158
1159 // compare index count to the max valid index
1160 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1161 // vIndexOffsets 0 1 2 3 4 5 6 7
1162 // ------------------------------
1163 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1164 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1165 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1166 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1167
1168 // VMASKLOAD takes an *i8 src pointer
1169 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1170
1171 // Load the indices; OOB loads 0
1172 return MASKLOADD(pIndices,vIndexMask);
1173 }
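// Illustrative scalar sketch (not part of the original fetch JIT; the helper
// name is hypothetical): the masked-load predicate built above. Lane i loads
// its index only while i is below the number of 32bit indices remaining in
// the buffer; masked-off lanes are zero-filled by MASKLOADD.
static inline bool LaneLoadsIndexRef(const uint32_t* pIndices, const uint32_t* pLastIndex, int32_t lane)
{
    auto numIndicesLeft = pLastIndex - pIndices; // (endPtr - curPtr) / sizeof(index)
    return lane < numIndicesLeft;
}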
1174
1175 //////////////////////////////////////////////////////////////////////////
1176 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1177 /// denormalizes if needed, converts to F32 if needed, and positions in
1178 /// the proper SIMD rows to be output to the simdvertex structure
1179 /// @param args: (tuple of args, listed below)
1180 /// @param vGatherResult - 8 gathered 8bpc vertices
1181 /// @param pVtxOut - base pointer to output simdvertex struct
1182 /// @param extendType - sign extend or zero extend
1183 /// @param conversionType - conversion type (normalized, scaled, or none)
1184 /// @param currentVertexElement - reference to the current vVertexElement
1185 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1186 /// @param compMask - component packing mask
1187 /// @param compCtrl - component control val
1188 /// @param vVertexElements[4] - vertex components to output
1189 /// @param swizzle[4] - component swizzle location
1190 /// @param fetchInfo - fetch shader info
1191 /// @param instanceIdEnable - InstanceID enabled?
1192 /// @param instanceIdComponentNumber - InstanceID component override
1193 /// @param vertexIdEnable - VertexID enabled?
1194 /// @param vertexIdComponentNumber - VertexID component override
1195 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1196 {
1197 // Unpack tuple args
1198 Value*& vGatherResult = std::get<0>(args);
1199 Value* pVtxOut = std::get<1>(args);
1200 const Instruction::CastOps extendType = std::get<2>(args);
1201 const ConversionType conversionType = std::get<3>(args);
1202 uint32_t &currentVertexElement = std::get<4>(args);
1203 uint32_t &outputElt = std::get<5>(args);
1204 const ComponentEnable compMask = std::get<6>(args);
1205 const ComponentControl (&compCtrl)[4] = std::get<7>(args);
1206 Value* (&vVertexElements)[4] = std::get<8>(args);
1207 const uint32_t (&swizzle)[4] = std::get<9>(args);
1208 Value *fetchInfo = std::get<10>(args);
1209 const bool instanceIdEnable = std::get<11>(args);
1210 const uint32_t instanceIdComponentNumber = std::get<12>(args);
1211 const bool vertexIdEnable = std::get<13>(args);
1212 const uint32_t vertexIdComponentNumber = std::get<14>(args);
1213
1214 // cast types
1215 Type* vGatherTy = mSimdInt32Ty;
1216 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1217
1218 // have to do extra work for sign extending
1219 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
1220 Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
1221 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1222
1223 // shuffle mask, including any swizzling
1224 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1225 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1226 Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
1227 char(y), char(y+4), char(y+8), char(y+12),
1228 char(z), char(z+4), char(z+8), char(z+12),
1229 char(w), char(w+4), char(w+8), char(w+12),
1230 char(x), char(x+4), char(x+8), char(x+12),
1231 char(y), char(y+4), char(y+8), char(y+12),
1232 char(z), char(z+4), char(z+8), char(z+12),
1233 char(w), char(w+4), char(w+8), char(w+12)});
1234
1235 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1236 // after pshufb: group components together in each 128bit lane
1237 // 256i - 0 1 2 3 4 5 6 7
1238 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1239
1240 Value* vi128XY = nullptr;
1241 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1242 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1243 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1244 // 256i - 0 1 2 3 4 5 6 7
1245 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1246 }
1247
1248 // do the same for zw components
1249 Value* vi128ZW = nullptr;
1250 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1251 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1252 }
1253
1254 // init denormalize variables if needed
1255 Instruction::CastOps fpCast;
1256 Value* conversionFactor;
1257
1258 switch (conversionType)
1259 {
1260 case CONVERT_NORMALIZED:
1261 fpCast = Instruction::CastOps::SIToFP;
1262 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1263 break;
1264 case CONVERT_SSCALED:
1265 fpCast = Instruction::CastOps::SIToFP;
1266 conversionFactor = VIMMED1((float)(1.0));
1267 break;
1268 case CONVERT_USCALED:
1269 SWR_ASSERT(0, "Type should not be sign extended!");
1270 conversionFactor = nullptr;
1271 break;
1272 default:
1273 SWR_ASSERT(conversionType == CONVERT_NONE);
1274 conversionFactor = nullptr;
1275 break;
1276 }
1277
1278 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1279 for (uint32_t i = 0; i < 4; i++)
1280 {
1281 if (isComponentEnabled(compMask, i))
1282 {
1283 // check for InstanceID SGV
1284 if (instanceIdEnable && (instanceIdComponentNumber == currentVertexElement))
1285 {
1286 // Load a SIMD of InstanceIDs
1287 vVertexElements[currentVertexElement++] = VBROADCAST(LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance }))); // InstanceID
1288 }
1289 // check for VertexID SGV
1290 else if (vertexIdEnable && (vertexIdComponentNumber == currentVertexElement))
1291 {
1292 // Load a SIMD of VertexIDs
1293 vVertexElements[currentVertexElement++] = LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
1294 }
1295 else if (compCtrl[i] == ComponentControl::StoreSrc)
1296 {
1297 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1298 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1299 // if x or y, use vi128XY permute result, else use vi128ZW
1300 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1301
1302 // sign extend
1303 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
1304
1305 // denormalize if needed
1306 if (conversionType != CONVERT_NONE)
1307 {
1308 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1309 }
1310 currentVertexElement++;
1311 }
1312 else
1313 {
1314 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1315 }
1316
1317 if (currentVertexElement > 3)
1318 {
1319 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1320 // reset to the next vVertexElement to output
1321 currentVertexElement = 0;
1322 }
1323 }
1324 }
1325 }
1326 // else zero extend
1327 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1328 {
1329 // init denormalize variables if needed
1330 Instruction::CastOps fpCast;
1331 Value* conversionFactor;
1332
1333 switch (conversionType)
1334 {
1335 case CONVERT_NORMALIZED:
1336 fpCast = Instruction::CastOps::UIToFP;
1337 conversionFactor = VIMMED1((float)(1.0 / 255.0));
1338 break;
1339 case CONVERT_USCALED:
1340 fpCast = Instruction::CastOps::UIToFP;
1341 conversionFactor = VIMMED1((float)(1.0));
1342 break;
1343 case CONVERT_SSCALED:
1344 SWR_ASSERT(0, "Type should not be zero extended!");
1345 conversionFactor = nullptr;
1346 break;
1347 default:
1348 SWR_ASSERT(conversionType == CONVERT_NONE);
1349 conversionFactor = nullptr;
1350 break;
1351 }
1352
1353 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1354 for (uint32_t i = 0; i < 4; i++)
1355 {
1356 if (isComponentEnabled(compMask, i))
1357 {
1358 // check for InstanceID SGV
1359 if (instanceIdEnable && (instanceIdComponentNumber == currentVertexElement))
1360 {
1361 // Load a SIMD of InstanceIDs
1362 vVertexElements[currentVertexElement++] = VBROADCAST(LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance }))); // InstanceID
1363 }
1364 // check for VertexID SGV
1365 else if (vertexIdEnable && (vertexIdComponentNumber == currentVertexElement))
1366 {
1367 // Load a SIMD of VertexIDs
1368 vVertexElements[currentVertexElement++] = LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
1369 }
1370 else if (compCtrl[i] == ComponentControl::StoreSrc)
1371 {
1372 // pshufb masks for each component
1373 Value* vConstMask;
1374 switch (swizzle[i])
1375 {
1376 case 0:
1377 // x shuffle mask
1378 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1379 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
1380 break;
1381 case 1:
1382 // y shuffle mask
1383 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1384 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
1385 break;
1386 case 2:
1387 // z shuffle mask
1388 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1389 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
1390 break;
1391 case 3:
1392 // w shuffle mask
1393 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1394 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
1395 break;
1396 default:
1397 vConstMask = nullptr;
1398 break;
1399 }
1400
1401 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1402 // after pshufb for x channel
1403 // 256i - 0 1 2 3 4 5 6 7
1404 // x000 x000 x000 x000 x000 x000 x000 x000
1405
1406 // denormalize if needed
1407 if (conversionType != CONVERT_NONE)
1408 {
1409 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1410 }
1411 currentVertexElement++;
1412 }
1413 else
1414 {
1415 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1416 }
1417
1418 if (currentVertexElement > 3)
1419 {
1420 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1421 // reset to the next vVertexElement to output
1422 currentVertexElement = 0;
1423 }
1424 }
1425 }
1426 }
1427 else
1428 {
1429 SWR_ASSERT(0, "Unsupported conversion type");
1430 }
1431 }
1432
1433 //////////////////////////////////////////////////////////////////////////
1434 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1435 /// denormalizes if needed, converts to F32 if needed, and positions in
1436 /// the proper SIMD rows to be output to the simdvertex structure
1437 /// @param args: (tuple of args, listed below)
1438 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1439 /// @param pVtxOut - base pointer to output simdvertex struct
1440 /// @param extendType - sign extend or zero extend
1441 /// @param conversionType - conversion type (normalized, scaled, or none)
1442 /// @param currentVertexElement - reference to the current vVertexElement
1443 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1444 /// @param compMask - component packing mask
1445 /// @param compCtrl - component control val
1446 /// @param vVertexElements[4] - vertex components to output
1447 /// @param fetchInfo - fetch shader info
1448 /// @param instanceIdEnable - InstanceID enabled?
1449 /// @param instanceIdComponentNumber - InstanceID component override
1450 /// @param vertexIdEnable - VertexID enabled?
1451 /// @param vertexIdComponentNumber - VertexID component override
1452 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
1453 {
1454 // Unpack tuple args
1455 Value* (&vGatherResult)[2] = std::get<0>(args);
1456 Value* pVtxOut = std::get<1>(args);
1457 const Instruction::CastOps extendType = std::get<2>(args);
1458 const ConversionType conversionType = std::get<3>(args);
1459 uint32_t &currentVertexElement = std::get<4>(args);
1460 uint32_t &outputElt = std::get<5>(args);
1461 const ComponentEnable compMask = std::get<6>(args);
1462 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1463 Value* (&vVertexElements)[4] = std::get<8>(args);
1464 Value *fetchInfo = std::get<9>(args);
1465 const bool instanceIdEnable = std::get<10>(args);
1466 const uint32_t instanceIdComponentNumber = std::get<11>(args);
1467 const bool vertexIdEnable = std::get<12>(args);
1468 const uint32_t vertexIdComponentNumber = std::get<13>(args);
1469
1470 // cast types
1471 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1472 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
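// For example, on an 8-wide (AVX) build mVWidth == 8, so vGatherTy is <8 x i32>
// and v32x8Ty is <32 x i8>: the same 256 bits of gathered data reinterpreted as
// bytes so the PSHUFB shuffles below can rearrange the 16-bit components within
// each 128-bit lane.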
1473
1474 // have to do extra work for sign extending
1475 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
1476 (extendType == Instruction::CastOps::FPExt))
1477 {
1478 // is this PP float?
1479 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1480
1481 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1482 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1483
1484 // shuffle mask
1485 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1486 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1487 Value* vi128XY = nullptr;
1488 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1489 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1490 // after pshufb: group components together in each 128bit lane
1491 // 256i - 0 1 2 3 4 5 6 7
1492 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1493
1494 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1495 // after PERMD: move and pack xy components into each 128bit lane
1496 // 256i - 0 1 2 3 4 5 6 7
1497 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1498 }
1499
1500 // do the same for zw components
1501 Value* vi128ZW = nullptr;
1502 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1503 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1504 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1505 }
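// At this point (when the corresponding components are enabled) vi128XY holds all
// mVWidth X values packed into its low 128-bit lane and the Y values in its high
// lane, and vi128ZW holds Z/W the same way. The loop below can therefore VEXTRACT
// a single 128-bit lane to obtain the packed 16-bit values of one component.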
1506
1507 // init denormalize variables if needed
1508 Instruction::CastOps IntToFpCast;
1509 Value* conversionFactor;
1510
1511 switch (conversionType)
1512 {
1513 case CONVERT_NORMALIZED:
1514 IntToFpCast = Instruction::CastOps::SIToFP;
1515 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1516 break;
1517 case CONVERT_SSCALED:
1518 IntToFpCast = Instruction::CastOps::SIToFP;
1519 conversionFactor = VIMMED1((float)(1.0));
1520 break;
1521 case CONVERT_USCALED:
1522 SWR_ASSERT(0, "Type should not be sign extended!");
1523 conversionFactor = nullptr;
1524 break;
1525 default:
1526 SWR_ASSERT(conversionType == CONVERT_NONE);
1527 conversionFactor = nullptr;
1528 break;
1529 }
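// Worked example for CONVERT_NORMALIZED (SNORM16): a raw component of 16384
// sign-extends to 16384 and scales by 1/32767 to ~0.50002f, while -32768 scales
// to ~-1.00003f (no clamping is applied in this path).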
1530
1531 // sign extend all enabled components. Once vVertexElements is full, output to the current simdvertex
1532 for (uint32_t i = 0; i < 4; i++)
1533 {
1534 if (isComponentEnabled(compMask, i))
1535 {
1536 // check for InstanceID SGV
1537 if (instanceIdEnable && (instanceIdComponentNumber == currentVertexElement))
1538 {
1539 // Load a SIMD of InstanceIDs
1540 vVertexElements[currentVertexElement++] = VBROADCAST(LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance }))); // InstanceID
1541 }
1542 // check for VertexID SGV
1543 else if (vertexIdEnable && (vertexIdComponentNumber == currentVertexElement))
1544 {
1545 // Load a SIMD of VertexIDs
1546 vVertexElements[currentVertexElement++] = LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
1547 }
1548 else if (compCtrl[i] == ComponentControl::StoreSrc)
1549 {
1550 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1551 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1552 // if x or y, use vi128XY permute result, else use vi128ZW
1553 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1554
1555 if (bFP) {
1556 // extract 128 bit lanes and convert the packed FP16 values to FP32 (no sign extension needed)
1557 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1558 }
1559 else {
1560 // extract 128 bit lanes to sign extend each component
1561 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1562
1563 // denormalize if needed
1564 if (conversionType != CONVERT_NONE) {
1565 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1566 }
1567 }
1568 currentVertexElement++;
1569 }
1570 else
1571 {
1572 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1573 }
1574
1575 if (currentVertexElement > 3)
1576 {
1577 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1578 // reset to the next vVertexElement to output
1579 currentVertexElement = 0;
1580 }
1581 }
1582 }
1583 }
1584 // else zero extend
1585 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1586 {
1587 // pshufb masks for each component
1588 Value* vConstMask[2];
1589 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
1590 // x/z shuffle mask
1591 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1592 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1593 }
1594
1595 if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
1596 // y/w shuffle mask
1597 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1598 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1599 }
1600
1601 // init denormalize variables if needed
1602 Instruction::CastOps fpCast;
1603 Value* conversionFactor;
1604
1605 switch (conversionType)
1606 {
1607 case CONVERT_NORMALIZED:
1608 fpCast = Instruction::CastOps::UIToFP;
1609 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1610 break;
1611 case CONVERT_USCALED:
1612 fpCast = Instruction::CastOps::UIToFP;
1613 conversionFactor = VIMMED1((float)(1.0f));
1614 break;
1615 case CONVERT_SSCALED:
1616 SWR_ASSERT(0, "Type should not be zero extended!");
1617 conversionFactor = nullptr;
1618 break;
1619 default:
1620 SWR_ASSERT(conversionType == CONVERT_NONE);
1621 conversionFactor = nullptr;
1622 break;
1623 }
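// Worked example for CONVERT_NORMALIZED (UNORM16): 65535 * (1/65535) = 1.0f and
// 32768 * (1/65535) is ~0.50001f; CONVERT_USCALED keeps the integer magnitude
// (e.g. a raw value of 300 becomes 300.0f).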
1624
1625 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1626 for (uint32_t i = 0; i < 4; i++)
1627 {
1628 if (isComponentEnabled(compMask, i))
1629 {
1630 // check for InstanceID SGV
1631 if (instanceIdEnable && (instanceIdComponentNumber == currentVertexElement))
1632 {
1633 // Load a SIMD of InstanceIDs
1634 vVertexElements[currentVertexElement++] = VBROADCAST(LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance }))); // InstanceID
1635 }
1636 // check for VertexID SGV
1637 else if (vertexIdEnable && (vertexIdComponentNumber == currentVertexElement))
1638 {
1639 // Load a SIMD of VertexIDs
1640 vVertexElements[currentVertexElement++] = LOAD(GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
1641 }
1642 else if (compCtrl[i] == ComponentControl::StoreSrc)
1643 {
1644 // select correct constMask for x/z or y/w pshufb
1645 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1646 // if x or y, use vi128XY permute result, else use vi128ZW
1647 uint32_t selectedGather = (i < 2) ? 0 : 1;
1648
1649 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1650 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1651 // 256i - 0 1 2 3 4 5 6 7
1652 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1653
1654 // denormalize if needed
1655 if (conversionType != CONVERT_NONE)
1656 {
1657 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1658 }
1659 currentVertexElement++;
1660 }
1661 else
1662 {
1663 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1664 }
1665
1666 if (currentVertexElement > 3)
1667 {
1668 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1669 // reset to the next vVertexElement to output
1670 currentVertexElement = 0;
1671 }
1672 }
1673 }
1674 }
1675 else
1676 {
1677 SWR_ASSERT(0, "Unsupported conversion type");
1678 }
1679 }
1680
1681 //////////////////////////////////////////////////////////////////////////
1682 /// @brief Output a simdvertex worth of elements to the current outputElt
1683 /// @param pVtxOut - base address of VIN output struct
1684 /// @param outputElt - simdvertex offset in VIN to write to
1685 /// @param numEltsToStore - number of simdvertex rows to write out
1686 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
1687 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
1688 {
1689 for(uint32_t c = 0; c < numEltsToStore; ++c)
1690 {
1691 // STORE expects FP32 x vWidth type, just bitcast if needed
1692 if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){
1693 #if FETCH_DUMP_VERTEX
1694 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
1695 #endif
1696 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
1697 }
1698 #if FETCH_DUMP_VERTEX
1699 else
1700 {
1701 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
1702 }
1703 #endif
1704 // outputElt * 4 = offsetting by the size of a simdvertex
1705 // + c offsets to a 32bit x vWidth row within the current vertex
1706 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
1707 STORE(vVertexElements[c], dest);
1708 }
1709 }
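// Illustrative only: on an 8-wide build the GEP arithmetic above walks memory laid
// out like the hypothetical host-side mirror below (names are not from the SWR
// headers). "outputElt * 4 + c" selects vertex element 'outputElt', channel row 'c'.
//
//    struct SimdVertexElementSketch
//    {
//        float x[8];   // one 32bit x vWidth row per channel
//        float y[8];
//        float z[8];
//        float w[8];
//    };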
1710
1711 //////////////////////////////////////////////////////////////////////////
1712 /// @brief Generates a constant vector of values based on the
1713 /// ComponentControl value
1714 /// @param ctrl - ComponentControl value
1715 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
1716 {
1717 switch(ctrl)
1718 {
1719 case NoStore: return VUNDEF_I();
1720 case Store0: return VIMMED1(0);
1721 case Store1Fp: return VIMMED1(1.0f);
1722 case Store1Int: return VIMMED1(1);
1723 case StoreSrc:
1724 default: SWR_ASSERT(0, "Invalid component control"); return VUNDEF_I();
1725 }
1726 }
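// Typical use (not shown in this file): a two-component 16-bit format would
// normally be fetched with compCtrl = { StoreSrc, StoreSrc, Store0, Store1Fp },
// so the shuffle loops above emit the gathered X/Y while this helper fills Z
// with 0 and W with 1.0f.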
1727
1728 //////////////////////////////////////////////////////////////////////////
1729 /// @brief Returns the enable mask for the specified component.
1730 /// @param enableMask - enable bits
1731 /// @param component - component to check if enabled.
1732 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
1733 {
1734 switch (component)
1735 {
1736 // X
1737 case 0: return (enableMask & ComponentEnable::X);
1738 // Y
1739 case 1: return (enableMask & ComponentEnable::Y);
1740 // Z
1741 case 2: return (enableMask & ComponentEnable::Z);
1742 // W
1743 case 3: return (enableMask & ComponentEnable::W);
1744
1745 default: return false;
1746 }
1747 }
1748
1749
1750 //////////////////////////////////////////////////////////////////////////
1751 /// @brief JITs from fetch shader IR
1752 /// @param hJitMgr - JitManager handle
1753 /// @param func - LLVM function IR
1754 /// @return PFN_FETCH_FUNC - pointer to fetch code
1755 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
1756 {
1757 const llvm::Function* func = (const llvm::Function*)hFunc;
1758 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1759 PFN_FETCH_FUNC pfnFetch;
1760
1761 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
1762 // MCJIT finalizes a module the first time code is JITted from it; once finalized, no new IR can be added to the module
1763 pJitMgr->mIsModuleFinalized = true;
1764
1765 #if defined(KNOB_SWRC_TRACING)
1766 char fName[1024];
1767 const char *funcName = func->getName().data();
1768 snprintf(fName, sizeof(fName), "%s.bin", funcName);
1769 FILE *fd = fopen(fName, "wb");
1770 if (fd)
1771 { fwrite((void *)pfnFetch, 1, 2048, fd); fclose(fd); }
1772 #endif
1773
1774 return pfnFetch;
1775 }
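// Note: because mIsModuleFinalized is set above, any later fetch shader has to be
// built in a fresh module; JitCompileFetch below calls SetupNewModule() for that
// reason.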
1776
1777 //////////////////////////////////////////////////////////////////////////
1778 /// @brief JIT compiles fetch shader
1779 /// @param hJitMgr - JitManager handle
1780 /// @param state - fetch state to build function from
1781 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
1782 {
1783 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1784
1785 pJitMgr->SetupNewModule();
1786
1787 FetchJit theJit(pJitMgr);
1788 HANDLE hFunc = theJit.Create(state);
1789
1790 return JitFetchFunc(hJitMgr, hFunc);
1791 }
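// Usage sketch (driver side, simplified; the call signature of the returned
// PFN_FETCH_FUNC is defined in the SWR headers, not here):
//
//    FETCH_COMPILE_STATE fetchState = {};
//    // ... fill fetchState from the current vertex layout and index type ...
//    PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, fetchState);
//    // pfnFetch is then cached with the draw state and called once per SIMD of vertices.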