swr: [rasterizer fetch] add support for 24bit format fetch
[mesa.git] / src / gallium / drivers / swr / rasterizer / jitter / fetch_jit.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_api.h"
31 #include "fetch_jit.h"
32 #include "builder.h"
33 #include "state_llvm.h"
34 #include <sstream>
35 #include <tuple>
36
37 //#define FETCH_DUMP_VERTEX 1
38
39 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
40
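// post-gather conversion applied to fetched components:
//   CONVERT_NONE       - store the gathered bits as-is
//   CONVERT_NORMALIZED - UNORM/SNORM: convert to float and scale into [0,1] / [-1,1]
//   CONVERT_USCALED    - unsigned int converted to float, no scaling
//   CONVERT_SSCALED    - signed int converted to float, no scaling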
41 enum ConversionType
42 {
43 CONVERT_NONE,
44 CONVERT_NORMALIZED,
45 CONVERT_USCALED,
46 CONVERT_SSCALED,
47 };
48
49 //////////////////////////////////////////////////////////////////////////
50 /// Interface to Jitting a fetch shader
51 //////////////////////////////////////////////////////////////////////////
52 struct FetchJit : public Builder
53 {
54 FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
55
56 Function* Create(const FETCH_COMPILE_STATE& fetchState);
57 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
58 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
59 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
60
61 // package up Shuffle*bpcGatherd args into a tuple for convenience
62 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
63 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
64 const uint32_t(&)[4]> Shuffle8bpcArgs;
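    // tuple layout: (vGatherResult, pVtxOut, extendCastType, conversionType,
    //                currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, swizzle)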
65 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
66
67 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
68 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
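    // tuple layout: (vGatherResult[2], pVtxOut, extendCastType, conversionType,
    //                currentVertexElement, outputElt, compMask, compCtrl, vVertexElements)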
69 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
70
71 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
72
73 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
74
75 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
76 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
77
78 bool IsOddFormat(SWR_FORMAT format);
79 bool IsUniformFormat(SWR_FORMAT format);
80 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
81 void CreateGatherOddFormats(SWR_FORMAT format, Value* pBase, Value* offsets, Value* result[4]);
82 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
83
84 Value* mpFetchInfo;
85 };
86
87 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
88 {
89 static std::size_t fetchNum = 0;
90
91 std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
92 fnName << fetchNum++;
93
94 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
95 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
96
97 IRB()->SetInsertPoint(entry);
98
99 auto argitr = fetch->getArgumentList().begin();
100
101 // Fetch shader arguments
102 mpFetchInfo = &*argitr; ++argitr;
103 mpFetchInfo->setName("fetchInfo");
104 Value* pVtxOut = &*argitr;
105 pVtxOut->setName("vtxOutput");
106 // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex
107 // index 0 (just the pointer to the simdvertex structure)
108 // index 1 (which element of the simdvertex structure to offset to; in this case 0)
109 // so it doesn't matter that the indices are i32s
110 // TODO: generate this GEP with a VECTOR structure type so this makes sense
111 std::vector<Value*> vtxInputIndices(2, C(0));
112 // GEP
113 pVtxOut = GEP(pVtxOut, C(0));
114 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
115
116 // SWR_FETCH_CONTEXT::pStreams
117 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
118 streams->setName("pStreams");
119
120 // SWR_FETCH_CONTEXT::pIndices
121 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
122 indices->setName("pIndices");
123
124 // SWR_FETCH_CONTEXT::pLastIndex
125 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
126 pLastIndex->setName("pLastIndex");
127
128
129 Value* vIndices;
130 switch(fetchState.indexType)
131 {
132 case R8_UINT:
133 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
134 if(fetchState.bDisableIndexOOBCheck){
135 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
136 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
137 }
138 else{
139 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
140 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
141 }
142 break;
143 case R16_UINT:
144 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
145 if(fetchState.bDisableIndexOOBCheck){
146 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
147 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
148 }
149 else{
150 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
151 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
152 }
153 break;
154 case R32_UINT:
155 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
156 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
157 break; // incoming type is already 32bit int
158 default: SWR_ASSERT(0, "Unsupported index type"); vIndices = nullptr; break;
159 }
160
161 // store out vertex IDs
162 STORE(vIndices, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
163
164 // store out cut mask if enabled
165 if (fetchState.bEnableCutIndex)
166 {
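        // compare each lane's index against the cut (primitive restart) index;
        // matching lanes set their bit in the cut mask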
167 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
168 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
169 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
170 }
171
172 // Fetch attributes from memory and output to a simdvertex struct
173 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
174 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
175 : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
176
177 RET_VOID();
178
179 JitManager::DumpToFile(fetch, "src");
180
181 #if defined(_DEBUG)
182 verifyFunction(*fetch);
183 #endif
184
185 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
186
187 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
188 setupPasses.add(createBreakCriticalEdgesPass());
189 setupPasses.add(createCFGSimplificationPass());
190 setupPasses.add(createEarlyCSEPass());
191 setupPasses.add(createPromoteMemoryToRegisterPass());
192
193 setupPasses.run(*fetch);
194
195 JitManager::DumpToFile(fetch, "se");
196
197 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
198
199 ///@todo Haven't touched these either. Need to remove some of these and add others.
200 optPasses.add(createCFGSimplificationPass());
201 optPasses.add(createEarlyCSEPass());
202 optPasses.add(createInstructionCombiningPass());
203 optPasses.add(createInstructionSimplifierPass());
204 optPasses.add(createConstantPropagationPass());
205 optPasses.add(createSCCPPass());
206 optPasses.add(createAggressiveDCEPass());
207
208 optPasses.run(*fetch);
209 optPasses.run(*fetch);
210
211 JitManager::DumpToFile(fetch, "opt");
212
213 return fetch;
214 }
215
216 //////////////////////////////////////////////////////////////////////////
217 /// @brief Loads attributes from memory using LOADs, shuffling the
218 /// components into SOA form.
219 /// *Note* currently does not support component control,
220 /// component packing, instancing
221 /// @param fetchState - info about attributes to be fetched from memory
222 /// @param streams - value pointer to the current vertex stream
223 /// @param vIndices - vector value of indices to load
224 /// @param pVtxOut - value pointer to output simdvertex struct
225 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
226 {
227 // Zack shuffles; a variant of the Charleston.
228
229 std::vector<Value*> vectors(16);
230 std::vector<Constant*> pMask(mVWidth);
231 for(uint32_t i = 0; i < mVWidth; ++i)
232 {
233 pMask[i] = (C(i < 4 ? i : 4));
234 }
235 Constant* promoteMask = ConstantVector::get(pMask);
236 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
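    // promoteMask widens a 4-wide attribute load to the full simd width: lanes 0..3
    // pick the loaded components, lanes >= 4 pick element 4, i.e. lane 0 of the undef
    // vector uwvec (the upper lanes are don't-care)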
237
238 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
239 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
240 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
241 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
242 curInstance->setName("curInstance");
243
244 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
245 {
246 Value* elements[4] = {0};
247 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
248 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
249 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
250 uint32_t numComponents = info.numComps;
251 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
252
253 // load path doesn't support component packing
254 SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
255
256 vectors.clear();
257
258 Value *vCurIndices;
259 Value *startOffset;
260 if(ied.InstanceEnable)
261 {
262 Value* stepRate = C(ied.InstanceDataStepRate);
263
264 // prevent a div by 0 for 0 step rate
265 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
266 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
267
268 // calc the current offset into instanced data buffer
269 Value* calcInstance = UDIV(curInstance, stepRate);
270
271 // if step rate is 0, every instance gets instance 0
272 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
273
274 vCurIndices = VBROADCAST(calcInstance);
275
276 startOffset = startInstance;
277 }
278 else
279 {
280 // offset indices by baseVertex
281 vCurIndices = ADD(vIndices, vBaseVertex);
282
283 startOffset = startVertex;
284 }
285
286 // load SWR_VERTEX_BUFFER_STATE::pData
287 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
288
289 // load SWR_VERTEX_BUFFER_STATE::pitch
290 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
291 stride = Z_EXT(stride, mInt64Ty);
292
293 // load SWR_VERTEX_BUFFER_STATE::size
294 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
295 size = Z_EXT(size, mInt64Ty);
296
297 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
298
299 // Load from the stream.
300 for(uint32_t lane = 0; lane < mVWidth; ++lane)
301 {
302 // Get index
303 Value* index = VEXTRACT(vCurIndices, C(lane));
304 index = Z_EXT(index, mInt64Ty);
305
306 Value* offset = MUL(index, stride);
307 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
308 offset = ADD(offset, startVertexOffset);
309
310 if (!fetchState.bDisableIndexOOBCheck) {
311 // check for out of bound access, including partial OOB, and mask them to 0
312 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
313 Value *oob = ICMP_ULE(endOffset, size);
314 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
315 }
316
317 Value* pointer = GEP(stream, offset);
318 // We use a full-lane, but don't actually care.
319 Value* vptr = 0;
320
321 // get a pointer to a 4 component attrib in default address space
322 switch(bpc)
323 {
324 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
325 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
326 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
327 default: SWR_ASSERT(false, "Unsupported underlying bpp!");
328 }
329
330 // load 4 components of attribute
331 Value* vec = ALIGNED_LOAD(vptr, 1, false);
332
333 // Convert To FP32 internally
334 switch(info.type[0])
335 {
336 case SWR_TYPE_UNORM:
337 switch(bpc)
338 {
339 case 8:
340 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
341 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
342 break;
343 case 16:
344 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
345 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
346 break;
347 default:
348 SWR_ASSERT(false, "Unsupported underlying type!");
349 break;
350 }
351 break;
352 case SWR_TYPE_SNORM:
353 switch(bpc)
354 {
355 case 8:
356 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
357 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
358 break;
359 case 16:
360 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
361 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
362 break;
363 default:
364 SWR_ASSERT(false, "Unsupported underlying type!");
365 break;
366 }
367 break;
368 case SWR_TYPE_UINT:
369 // Zero extend UINT types to 32 bits.
370 switch(bpc)
371 {
372 case 8:
373 case 16:
374 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
375 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
376 break;
377 case 32:
378 break; // Pass through unchanged.
379 default:
380 SWR_ASSERT(false, "Unsupported underlying type!");
381 break;
382 }
383 break;
384 case SWR_TYPE_SINT:
385 // Sign extend SINT types.
386 switch(bpc)
387 {
388 case 8:
389 case 16:
390 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
391 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
392 break;
393 case 32:
394 break; // Pass through unchanged.
395 default:
396 SWR_ASSERT(false, "Unsupported underlying type!");
397 break;
398 }
399 break;
400 case SWR_TYPE_FLOAT:
401 switch(bpc)
402 {
403 case 32:
404 break; // Pass through unchanged.
405 default:
406 SWR_ASSERT(false, "Unsupported underlying type!");
407 }
408 break;
409 case SWR_TYPE_USCALED:
410 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
411 break;
412 case SWR_TYPE_SSCALED:
413 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
414 break;
415 case SWR_TYPE_UNKNOWN:
416 case SWR_TYPE_UNUSED:
417 SWR_ASSERT(false, "Unsupported type %d!", info.type[0]);
418 }
419
420 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
421 // uwvec: 4 x F32, undef value
422 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
423 vectors.push_back(wvec);
424 }
425
426 std::vector<Constant*> v01Mask(mVWidth);
427 std::vector<Constant*> v23Mask(mVWidth);
428 std::vector<Constant*> v02Mask(mVWidth);
429 std::vector<Constant*> v13Mask(mVWidth);
430
431 // Concatenate the vectors together.
432 elements[0] = VUNDEF_F();
433 elements[1] = VUNDEF_F();
434 elements[2] = VUNDEF_F();
435 elements[3] = VUNDEF_F();
436 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
437 {
438 v01Mask[4 * b + 0] = C(0 + 4 * b);
439 v01Mask[4 * b + 1] = C(1 + 4 * b);
440 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
441 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
442
443 v23Mask[4 * b + 0] = C(2 + 4 * b);
444 v23Mask[4 * b + 1] = C(3 + 4 * b);
445 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
446 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
447
448 v02Mask[4 * b + 0] = C(0 + 4 * b);
449 v02Mask[4 * b + 1] = C(2 + 4 * b);
450 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
451 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
452
453 v13Mask[4 * b + 0] = C(1 + 4 * b);
454 v13Mask[4 * b + 1] = C(3 + 4 * b);
455 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
456 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
457
458 std::vector<Constant*> iMask(mVWidth);
459 for(uint32_t i = 0; i < mVWidth; ++i)
460 {
461 if(((4 * b) <= i) && (i < (4 * (b + 1))))
462 {
463 iMask[i] = C(i % 4 + mVWidth);
464 }
465 else
466 {
467 iMask[i] = C(i);
468 }
469 }
470 Constant* insertMask = ConstantVector::get(iMask);
471 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
472 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
473 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
474 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
475 }
476
477 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
478 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
479 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
480 Value* z2w2z3w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
481 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
482 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
483 elements[2] = VSHUFFLE(z0w0z1w1, z2w2z3w3, ConstantVector::get(v02Mask));
484 elements[3] = VSHUFFLE(z0w0z1w1, z2w2z3w3, ConstantVector::get(v13Mask));
485
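// note: intentional fall-through; components missing from the format get defaults
// (0.0f for x/y/z, 1.0f for w). e.g. a 2-component format enters at case 3, zeroing
// z and falling through to set w to 1.0f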
486 switch(numComponents + 1)
487 {
488 case 1: elements[0] = VIMMED1(0.0f);
489 case 2: elements[1] = VIMMED1(0.0f);
490 case 3: elements[2] = VIMMED1(0.0f);
491 case 4: elements[3] = VIMMED1(1.0f);
492 }
493
494 for(uint32_t c = 0; c < 4; ++c)
495 {
496 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
497 STORE(elements[c], dest);
498 }
499 }
500 }
501
502 // returns true for odd formats that require special gather handling
503 bool FetchJit::IsOddFormat(SWR_FORMAT format)
504 {
505 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
506 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32)
507 {
508 return true;
509 }
510 return false;
511 }
512
513 // format is uniform if all components are the same size and type
514 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
515 {
516 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
517 uint32_t bpc0 = info.bpc[0];
518 uint32_t type0 = info.type[0];
519
520 for (uint32_t c = 1; c < info.numComps; ++c)
521 {
522 if (bpc0 != info.bpc[c] || type0 != info.type[c])
523 {
524 return false;
525 }
526 }
527 return true;
528 }
529
530 // unpacks components based on format
531 // foreach component in the pixel
532 // mask off everything but this component
533 // shift component to LSB
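// e.g. for a packed 5-6-5 format: component 0 masks bits [4:0] (shift 0),
// component 1 masks bits [10:5] (shift 5), component 2 masks bits [15:11] (shift 11);
// each unpacked value is written to result[swizzle[c]]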
534 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
535 {
536 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
537
538 uint32_t bitOffset = 0;
539 for (uint32_t c = 0; c < info.numComps; ++c)
540 {
541 uint32_t swizzledIndex = info.swizzle[c];
542 uint32_t compBits = info.bpc[c];
543 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
544 Value* comp = AND(vInput, bitmask);
545 comp = LSHR(comp, bitOffset);
546
547 result[swizzledIndex] = comp;
548 bitOffset += compBits;
549 }
550 }
551
552 // gather for odd component size formats
553 // gather a full pixel per SIMD lane, then shift/mask to move each component into
554 // its own vector
555 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pBase, Value* offsets, Value* result[4])
556 {
557 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
558
559 // only works if pixel size is <= 32bits
560 SWR_ASSERT(info.bpp <= 32);
561
562 Value* gather = VUNDEF_I();
563
564 // assign defaults
565 for (uint32_t comp = 0; comp < 4; ++comp)
566 {
567 result[comp] = VIMMED1((int)info.defaults[comp]);
568 }
569
570 // gather SIMD pixels
571 for (uint32_t e = 0; e < JM()->mVWidth; ++e)
572 {
573 Value* elemOffset = VEXTRACT(offsets, C(e));
574 Value* load = GEP(pBase, elemOffset);
575
576 // load the proper amount of data based on component size
577 switch (info.bpp)
578 {
579 case 8: load = POINTER_CAST(load, Type::getInt8PtrTy(JM()->mContext)); break;
580 case 16: load = POINTER_CAST(load, Type::getInt16PtrTy(JM()->mContext)); break;
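        // 24bit pixels are loaded as a full 32bit dword; the extra high byte never
        // lands in any component because UnpackComponents' per-component masks cover
        // only the low info.bpp bits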
581 case 24:
582 case 32: load = POINTER_CAST(load, Type::getInt32PtrTy(JM()->mContext)); break;
583 default: SWR_ASSERT(0);
584 }
585
586 // load pixel
587 Value *val = LOAD(load);
588
589 // zero extend to 32bit integer
590 val = INT_CAST(val, mInt32Ty, false);
591
592 // store in simd lane
593 gather = VINSERT(gather, val, C(e));
594 }
595
596 UnpackComponents(format, gather, result);
597
598 // cast to fp32
599 result[0] = BITCAST(result[0], mSimdFP32Ty);
600 result[1] = BITCAST(result[1], mSimdFP32Ty);
601 result[2] = BITCAST(result[2], mSimdFP32Ty);
602 result[3] = BITCAST(result[3], mSimdFP32Ty);
603 }
604
605 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
606 {
607 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
608
609 for (uint32_t c = 0; c < info.numComps; ++c)
610 {
611 uint32_t compIndex = info.swizzle[c];
612
613 // skip any conversion on UNUSED components
614 if (info.type[c] == SWR_TYPE_UNUSED)
615 {
616 continue;
617 }
618
619 if (info.isNormalized[c])
620 {
621 if (info.type[c] == SWR_TYPE_SNORM)
622 {
623 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
624
625 /// result = c * (1.0f / (2^(n-1) - 1))
626 uint32_t n = info.bpc[c];
627 uint32_t pow2 = 1 << (n - 1);
628 float scale = 1.0f / (float)(pow2 - 1);
629 Value *vScale = VIMMED1(scale);
630 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
631 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
632 texels[compIndex] = FMUL(texels[compIndex], vScale);
633 }
634 else
635 {
636 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
637
638 /// result = c * (1.0f / (2^n - 1))
639 uint32_t n = info.bpc[c];
640 uint32_t pow2 = 1 << n;
641 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
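            // e.g. for n == 24: pow2 - 1 == 0xFFFFFF, and 16777215.0f / 16777215.0f
            // divides out to exactly 1.0f, whereas multiplying by the rounded
            // reciprocal (1.0f / 16777215.0f) can land an ULP off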
642 if (n == 24)
643 {
644 float scale = (float)(pow2 - 1);
645 Value* vScale = VIMMED1(scale);
646 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
647 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
648 texels[compIndex] = FDIV(texels[compIndex], vScale);
649 }
650 else
651 {
652 float scale = 1.0f / (float)(pow2 - 1);
653 Value *vScale = VIMMED1(scale);
654 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
655 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
656 texels[compIndex] = FMUL(texels[compIndex], vScale);
657 }
658 }
659 continue;
660 }
661 }
662 }
663
664 //////////////////////////////////////////////////////////////////////////
665 /// @brief Loads attributes from memory using AVX2 GATHER(s)
666 /// @param fetchState - info about attributes to be fetched from memory
667 /// @param streams - value pointer to the current vertex stream
668 /// @param vIndices - vector value of indices to gather
669 /// @param pVtxOut - value pointer to output simdvertex struct
670 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
671 Value* streams, Value* vIndices, Value* pVtxOut)
672 {
673 uint32_t currentVertexElement = 0;
674 uint32_t outputElt = 0;
675 Value* vVertexElements[4];
676
677 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
678 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
679 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
680 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
681 curInstance->setName("curInstance");
682
683 for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
684 {
685 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
686
687 // skip element if all components are disabled
688 if (ied.ComponentPacking == ComponentEnable::NONE)
689 {
690 continue;
691 }
692
693 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
694 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
695 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
696
697 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
698
699 // VGATHER* takes an *i8 src pointer
700 Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
701
702 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
703 Value *vStride = VBROADCAST(stride);
704
705 // max vertex index that is fully in bounds
706 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
707 maxVertex = LOAD(maxVertex);
708
709 Value *vCurIndices;
710 Value *startOffset;
711 if(ied.InstanceEnable)
712 {
713 Value* stepRate = C(ied.InstanceDataStepRate);
714
715 // prevent a div by 0 for 0 step rate
716 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
717 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
718
719 // calc the current offset into instanced data buffer
720 Value* calcInstance = UDIV(curInstance, stepRate);
721
722 // if step rate is 0, every instance gets instance 0
723 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
724
725 vCurIndices = VBROADCAST(calcInstance);
726
727 startOffset = startInstance;
728 }
729 else
730 {
731 // offset indices by baseVertex
732 vCurIndices = ADD(vIndices, vBaseVertex);
733
734 startOffset = startVertex;
735 }
736
737 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
738 // do 64bit address offset calculations.
739
740 // calculate byte offset to the start of the VB
741 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
742 pStreamBase = GEP(pStreamBase, baseOffset);
743
744 // if we have a start offset, subtract from max vertex. Used for OOB check
745 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
746 Value* neg = ICMP_SLT(maxVertex, C((int64_t)0));
747 // if we have a negative value, we're already OOB. clamp at 0.
748 maxVertex = SELECT(neg, C(0), TRUNC(maxVertex, mInt32Ty));
749
750 // Load the in bounds size of a partially valid vertex
751 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
752 partialInboundsSize = LOAD(partialInboundsSize);
753 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
754 Value* vBpp = VBROADCAST(C(info.Bpp));
755 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
756
757 // is the element <= the partially valid size?
758 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
759
760 // override cur indices with 0 if pitch is 0
761 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
762 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
763
764 // are vertices partially OOB?
765 Value* vMaxVertex = VBROADCAST(maxVertex);
766 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
767
768 // are vertices fully in bounds?
769 Value* vGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
770
771 // blend in any partially OOB indices that have valid elements
772 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
773 vGatherMask = VMASK(vGatherMask);
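        // vGatherMask now has all bits set for lanes that are safe to gather: either
        // fully in bounds, or partially OOB with this element still inside the
        // partially valid region. Masked-off lanes keep the gather source value.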
774
775 // calculate the actual offsets into the VB
776 Value* vOffsets = MUL(vCurIndices, vStride);
777 vOffsets = ADD(vOffsets, vAlignmentOffsets);
778
779 // Packing and component control
780 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
781 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
782 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
783
784 // Special gather/conversion for formats without equal component sizes
785 if (IsOddFormat((SWR_FORMAT)ied.Format))
786 {
787 Value* pResults[4];
788 CreateGatherOddFormats((SWR_FORMAT)ied.Format, pStreamBase, vOffsets, pResults);
789 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
790
791 for (uint32_t c = 0; c < 4; ++c)
792 {
793 if (isComponentEnabled(compMask, c))
794 {
795 vVertexElements[currentVertexElement++] = pResults[c];
796 if (currentVertexElement > 3)
797 {
798 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
799 // reset to the next vVertexElement to output
800 currentVertexElement = 0;
801 }
802 }
803 }
804 }
805 else if(info.type[0] == SWR_TYPE_FLOAT)
806 {
807 ///@todo: support 64 bit vb accesses
808 Value* gatherSrc = VIMMED1(0.0f);
809
810 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
811 "Unsupported format for standard gather fetch.");
812
813 // Gather components from memory to store in a simdvertex structure
814 switch(bpc)
815 {
816 case 16:
817 {
818 Value* vGatherResult[2];
819 Value *vMask;
820
821 // if we have at least one component out of x or y to fetch
822 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
823 // save mask as it is zero'd out after each gather
824 vMask = vGatherMask;
825
826 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
827 // e.g. result of first 8x32bit integer gather for 16bit components
828 // 256i - 0 1 2 3 4 5 6 7
829 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
830 //
831 }
832
833 // if we have at least one component out of z or w to fetch
834 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
835 // offset base to the next components(zw) in the vertex to gather
836 pStreamBase = GEP(pStreamBase, C((char)4));
837 vMask = vGatherMask;
838
839 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
840 // e.g. result of second 8x32bit integer gather for 16bit components
841 // 256i - 0 1 2 3 4 5 6 7
842 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
843 //
844 }
845
846 // if we have at least one component to shuffle into place
847 if(compMask){
848 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
849 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
850
851 // Shuffle gathered components into place in simdvertex struct
852 Shuffle16bpcGather(args); // outputs to vVertexElements ref
853 }
854 }
855 break;
856 case 32:
857 {
858 for (uint32_t i = 0; i < 4; i++)
859 {
860 if (isComponentEnabled(compMask, i))
861 {
862 // if we need to gather the component
863 if (compCtrl[i] == StoreSrc)
864 {
865 // save mask as it is zero'd out after each gather
866 Value *vMask = vGatherMask;
867
868 // Gather a SIMD of vertices
869 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
870 }
871 else
872 {
873 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
874 }
875
876 if (currentVertexElement > 3)
877 {
878 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
879 // reset to the next vVertexElement to output
880 currentVertexElement = 0;
881 }
882
883 }
884
885 // offset base to the next component in the vertex to gather
886 pStreamBase = GEP(pStreamBase, C((char)4));
887 }
888 }
889 break;
890 default:
891 SWR_ASSERT(0, "Tried to fetch invalid FP format");
892 break;
893 }
894 }
895 else
896 {
897 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
898 ConversionType conversionType = CONVERT_NONE;
899
900 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
901 "Unsupported format for standard gather fetch.");
902
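            // note: UNORM and SNORM intentionally fall through to the UINT/SINT cases
            // below so they also pick up the matching zero/sign extend cast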
903 switch(info.type[0])
904 {
905 case SWR_TYPE_UNORM:
906 conversionType = CONVERT_NORMALIZED;
907 case SWR_TYPE_UINT:
908 extendCastType = Instruction::CastOps::ZExt;
909 break;
910 case SWR_TYPE_SNORM:
911 conversionType = CONVERT_NORMALIZED;
912 case SWR_TYPE_SINT:
913 extendCastType = Instruction::CastOps::SExt;
914 break;
915 case SWR_TYPE_USCALED:
916 conversionType = CONVERT_USCALED;
917 extendCastType = Instruction::CastOps::UIToFP;
918 break;
919 case SWR_TYPE_SSCALED:
920 conversionType = CONVERT_SSCALED;
921 extendCastType = Instruction::CastOps::SIToFP;
922 break;
923 default:
924 break;
925 }
926
927 // value substituted when component of gather is masked
928 Value* gatherSrc = VIMMED1(0);
929
930 // Gather components from memory to store in a simdvertex structure
931 switch (bpc)
932 {
933 case 8:
934 {
935 // if we have at least one component to fetch
936 if(compMask)
937 {
938 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
939 // e.g. result of an 8x32bit integer gather for 8bit components
940 // 256i - 0 1 2 3 4 5 6 7
941 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
942
943 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
944 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
945
946 // Shuffle gathered components into place in simdvertex struct
947 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
948 }
949 }
950 break;
951 case 16:
952 {
953 Value* vGatherResult[2];
954 Value *vMask;
955
956 // if we have at least one component out of x or y to fetch
957 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
958 // save mask as it is zero'd out after each gather
959 vMask = vGatherMask;
960
961 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
962 // e.g. result of first 8x32bit integer gather for 16bit components
963 // 256i - 0 1 2 3 4 5 6 7
964 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
965 //
966 }
967
968 // if we have at least one component out of z or w to fetch
969 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
970 // offset base to the next components(zw) in the vertex to gather
971 pStreamBase = GEP(pStreamBase, C((char)4));
972 vMask = vGatherMask;
973
974 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
975 // e.g. result of second 8x32bit integer gather for 16bit components
976 // 256i - 0 1 2 3 4 5 6 7
977 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
978 //
979 }
980
981 // if we have at least one component to shuffle into place
982 if(compMask){
983 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
984 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
985
986 // Shuffle gathered components into place in simdvertex struct
987 Shuffle16bpcGather(args); // outputs to vVertexElements ref
988 }
989 }
990 break;
991 case 32:
992 {
993 // Gathered components into place in simdvertex struct
994 for (uint32_t i = 0; i < 4; i++)
995 {
996 if (isComponentEnabled(compMask, i))
997 {
998 // if we need to gather the component
999 if (compCtrl[i] == StoreSrc)
1000 {
1001 // save mask as it is zero'd out after each gather
1002 Value *vMask = vGatherMask;
1003
1004 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
1005
1006 if (conversionType == CONVERT_USCALED)
1007 {
1008 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1009 }
1010 else if (conversionType == CONVERT_SSCALED)
1011 {
1012 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1013 }
1014
1015 vVertexElements[currentVertexElement++] = pGather;
1016 // e.g. result of a single 8x32bit integer gather for 32bit components
1017 // 256i - 0 1 2 3 4 5 6 7
1018 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1019 }
1020 else
1021 {
1022 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1023 }
1024
1025 if (currentVertexElement > 3)
1026 {
1027 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1028 // reset to the next vVertexElement to output
1029 currentVertexElement = 0;
1030 }
1031
1032 }
1033
1034 // offset base to the next component in the vertex to gather
1035 pStreamBase = GEP(pStreamBase, C((char)4));
1036 }
1037 }
1038 break;
1039 }
1040 }
1041 }
1042
1043 // if we have a partially filled vVertexElement struct, output it
1044 if(currentVertexElement > 0){
1045 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1046 }
1047 }
1048
1049 //////////////////////////////////////////////////////////////////////////
1050 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1051 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1052 /// support
1053 /// @param pIndices - pointer to 8 bit indices
1054 /// @param pLastIndex - pointer to last valid index
1055 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1056 {
1057 // can fit 4 8 bit integers per vWidth lane
1058 Value* vIndices = VUNDEF_I();
1059
1060 // store 0 index on stack to be used to conditionally load from if index address is OOB
1061 Value* pZeroIndex = ALLOCA(mInt8Ty);
1062 STORE(C((uint8_t)0), pZeroIndex);
1063
1064 // Load a SIMD of index pointers
1065 for(int64_t lane = 0; lane < mVWidth; lane++)
1066 {
1067 // Calculate the address of the requested index
1068 Value *pIndex = GEP(pIndices, C(lane));
1069
1070 // check if the address is less than the max index,
1071 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1072
1073 // if valid, load the index. if not, load 0 from the stack
1074 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1075 Value *index = LOAD(pValid, "valid index");
1076
1077 // zero extend the index to 32 bits and insert it into the correct simd lane
1078 index = Z_EXT(index, mInt32Ty);
1079 vIndices = VINSERT(vIndices, index, lane);
1080 }
1081 return vIndices;
1082 }
1083
1084 //////////////////////////////////////////////////////////////////////////
1085 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1086 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1087 /// support
1088 /// @param pIndices - pointer to 16 bit indices
1089 /// @param pLastIndex - pointer to last valid index
1090 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1091 {
1092 // can fit 2 16 bit integers per vWidth lane
1093 Value* vIndices = VUNDEF_I();
1094
1095 // store 0 index on stack to be used to conditionally load from if index address is OOB
1096 Value* pZeroIndex = ALLOCA(mInt16Ty);
1097 STORE(C((uint16_t)0), pZeroIndex);
1098
1099 // Load a SIMD of index pointers
1100 for(int64_t lane = 0; lane < mVWidth; lane++)
1101 {
1102 // Calculate the address of the requested index
1103 Value *pIndex = GEP(pIndices, C(lane));
1104
1105 // check if the address is less than the max index,
1106 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1107
1108 // if valid, load the index. if not, load 0 from the stack
1109 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1110 Value *index = LOAD(pValid, "valid index");
1111
1112 // zero extend the index to 32 bits and insert it into the correct simd lane
1113 index = Z_EXT(index, mInt32Ty);
1114 vIndices = VINSERT(vIndices, index, lane);
1115 }
1116 return vIndices;
1117 }
1118
1119 //////////////////////////////////////////////////////////////////////////
1120 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1121 /// @param pIndices - pointer to 32 bit indices
1122 /// @param pLastIndex - pointer to last valid index
1123 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1124 {
1125 DataLayout dL(JM()->mpCurrentModule);
1126 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1127 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1128 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1129
1130 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1131 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1132 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1133 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1134
1135 // create a vector of index counts from the base index ptr passed into the fetch
1136 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1137 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1138
1139 // compare index count to the max valid index
1140 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1141 // vIndexOffsets 0 1 2 3 4 5 6 7
1142 // ------------------------------
1143 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1144 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1145 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1146 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1147
1148 // VMASKLOAD takes an *i8 src pointer
1149 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1150
1151 // Load the indices; OOB loads 0
1152 return MASKLOADD(pIndices,vIndexMask);
1153 }
1154
1155 //////////////////////////////////////////////////////////////////////////
1156 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1157 /// denormalizes if needed, converts to F32 if needed, and positions in
1158 /// the proper SIMD rows to be output to the simdvertex structure
1159 /// @param args: (tuple of args, listed below)
1160 /// @param vGatherResult - 8 gathered 8bpc vertices
1161 /// @param pVtxOut - base pointer to output simdvertex struct
1162 /// @param extendType - sign extend or zero extend
1163 /// @param conversionType - normalization/scaling conversion to apply, if any
1164 /// @param currentVertexElement - reference to the current vVertexElement
1165 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1166 /// @param compMask - component packing mask
1167 /// @param compCtrl - component control val
1168 /// @param vVertexElements[4] - vertex components to output
1169 /// @param swizzle[4] - component swizzle location
1170 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1171 {
1172 // Unpack tuple args
1173 Value*& vGatherResult = std::get<0>(args);
1174 Value* pVtxOut = std::get<1>(args);
1175 const Instruction::CastOps extendType = std::get<2>(args);
1176 const ConversionType conversionType = std::get<3>(args);
1177 uint32_t &currentVertexElement = std::get<4>(args);
1178 uint32_t &outputElt = std::get<5>(args);
1179 const ComponentEnable compMask = std::get<6>(args);
1180 const ComponentControl (&compCtrl)[4] = std::get<7>(args);
1181 Value* (&vVertexElements)[4] = std::get<8>(args);
1182 const uint32_t (&swizzle)[4] = std::get<9>(args);
1183
1184 // cast types
1185 Type* vGatherTy = mSimdInt32Ty;
1186 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1187
1188 // have to do extra work for sign extending
1189 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
1190 Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
1191 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1192
1193 // shuffle mask, including any swizzling
1194 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1195 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1196 Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
1197 char(y), char(y+4), char(y+8), char(y+12),
1198 char(z), char(z+4), char(z+8), char(z+12),
1199 char(w), char(w+4), char(w+8), char(w+12),
1200 char(x), char(x+4), char(x+8), char(x+12),
1201 char(y), char(y+4), char(y+8), char(y+12),
1202 char(z), char(z+4), char(z+8), char(z+12),
1203 char(w), char(w+4), char(w+8), char(w+12)});
1204
1205 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1206 // after pshufb: group components together in each 128bit lane
1207 // 256i - 0 1 2 3 4 5 6 7
1208 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1209
1210 Value* vi128XY = nullptr;
1211 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1212 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1213 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1214 // 256i - 0 1 2 3 4 5 6 7
1215 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1216 }
1217
1218 // do the same for zw components
1219 Value* vi128ZW = nullptr;
1220 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1221 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1222 }
1223
1224 // init denormalize variables if needed
1225 Instruction::CastOps fpCast;
1226 Value* conversionFactor;
1227
1228 switch (conversionType)
1229 {
1230 case CONVERT_NORMALIZED:
1231 fpCast = Instruction::CastOps::SIToFP;
1232 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1233 break;
1234 case CONVERT_SSCALED:
1235 fpCast = Instruction::CastOps::SIToFP;
1236 conversionFactor = VIMMED1((float)(1.0));
1237 break;
1238 case CONVERT_USCALED:
1239 SWR_ASSERT(0, "Type should not be sign extended!");
1240 conversionFactor = nullptr;
1241 break;
1242 default:
1243 SWR_ASSERT(conversionType == CONVERT_NONE);
1244 conversionFactor = nullptr;
1245 break;
1246 }
1247
1248 // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
1249 for (uint32_t i = 0; i < 4; i++)
1250 {
1251 if (isComponentEnabled(compMask, i))
1252 {
1253 if (compCtrl[i] == ComponentControl::StoreSrc)
1254 {
1255 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1256 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1257 // if x or y, use vi128XY permute result, else use vi128ZW
1258 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1259
1260 // sign extend
1261 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
1262
1263 // denormalize if needed
1264 if (conversionType != CONVERT_NONE)
1265 {
1266 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1267 }
1268 currentVertexElement++;
1269 }
1270 else
1271 {
1272 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1273 }
1274
1275 if (currentVertexElement > 3)
1276 {
1277 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1278 // reset to the next vVertexElement to output
1279 currentVertexElement = 0;
1280 }
1281 }
1282 }
1283 }
1284 // else zero extend
1285 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1286 {
1287 // init denormalize variables if needed
1288 Instruction::CastOps fpCast;
1289 Value* conversionFactor;
1290
1291 switch (conversionType)
1292 {
1293 case CONVERT_NORMALIZED:
1294 fpCast = Instruction::CastOps::UIToFP;
1295 conversionFactor = VIMMED1((float)(1.0 / 255.0));
1296 break;
1297 case CONVERT_USCALED:
1298 fpCast = Instruction::CastOps::UIToFP;
1299 conversionFactor = VIMMED1((float)(1.0));
1300 break;
1301 case CONVERT_SSCALED:
1302 SWR_ASSERT(0, "Type should not be zero extended!");
1303 conversionFactor = nullptr;
1304 break;
1305 default:
1306 SWR_ASSERT(conversionType == CONVERT_NONE);
1307 conversionFactor = nullptr;
1308 break;
1309 }
1310
1311 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1312 for (uint32_t i = 0; i < 4; i++)
1313 {
1314 if (isComponentEnabled(compMask, i))
1315 {
1316 if (compCtrl[i] == ComponentControl::StoreSrc)
1317 {
1318 // pshufb masks for each component
1319 Value* vConstMask;
1320 switch (swizzle[i])
1321 {
1322 case 0:
1323 // x shuffle mask
1324 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1325 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
1326 break;
1327 case 1:
1328 // y shuffle mask
1329 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1330 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
1331 break;
1332 case 2:
1333 // z shuffle mask
1334 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1335 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
1336 break;
1337 case 3:
1338 // w shuffle mask
1339 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1340 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
1341 break;
1342 default:
1343 vConstMask = nullptr;
1344 break;
1345 }
1346
1347 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1348 // after pshufb for x channel
1349 // 256i - 0 1 2 3 4 5 6 7
1350 // x000 x000 x000 x000 x000 x000 x000 x000
1351
1352 // denormalize if needed
1353 if (conversionType != CONVERT_NONE)
1354 {
1355 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1356 }
1357 currentVertexElement++;
1358 }
1359 else
1360 {
1361 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1362 }
1363
1364 if (currentVertexElement > 3)
1365 {
1366 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1367 // reset to the next vVertexElement to output
1368 currentVertexElement = 0;
1369 }
1370 }
1371 }
1372 }
1373 else
1374 {
1375 SWR_ASSERT(0, "Unsupported conversion type");
1376 }
1377 }
1378
1379 //////////////////////////////////////////////////////////////////////////
1380 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1381 /// denormalizes if needed, converts to F32 if needed, and positions in
1382 /// the proper SIMD rows to be output to the simdvertex structure
1383 /// @param args: (tuple of args, listed below)
1384 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1385 /// @param pVtxOut - base pointer to output simdvertex struct
1386 /// @param extendType - sign extend or zero extend
1387 /// @param conversionType - normalization/scaling conversion to apply, if any
1388 /// @param currentVertexElement - reference to the current vVertexElement
1389 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1390 /// @param compMask - component packing mask
1391 /// @param compCtrl - component control val
1392 /// @param vVertexElements[4] - vertex components to output
1393 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
1394 {
1395 // Unpack tuple args
1396 Value* (&vGatherResult)[2] = std::get<0>(args);
1397 Value* pVtxOut = std::get<1>(args);
1398 const Instruction::CastOps extendType = std::get<2>(args);
1399 const ConversionType conversionType = std::get<3>(args);
1400 uint32_t &currentVertexElement = std::get<4>(args);
1401 uint32_t &outputElt = std::get<5>(args);
1402 const ComponentEnable compMask = std::get<6>(args);
1403 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1404 Value* (&vVertexElements)[4] = std::get<8>(args);
1405
1406 // cast types
1407 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1408 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1409
1410 // have to do extra work for sign extending
1411 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
1412 (extendType == Instruction::CastOps::FPExt))
1413 {
1414 // is this a partial-precision (16bit) float?
1415 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1416
1417 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1418 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1419
1420 // shuffle mask
1421 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1422 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1423 Value* vi128XY = nullptr;
1424 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1425 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1426 // after pshufb: group components together in each 128bit lane
1427 // 256i - 0 1 2 3 4 5 6 7
1428 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1429
1430 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1431 // after PERMD: move and pack xy components into each 128bit lane
1432 // 256i - 0 1 2 3 4 5 6 7
1433 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1434 }
1435
1436 // do the same for zw components
1437 Value* vi128ZW = nullptr;
1438 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1439 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1440 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1441 }
1442
1443 // init denormalize variables if needed
1444 Instruction::CastOps IntToFpCast;
1445 Value* conversionFactor;
1446
1447 switch (conversionType)
1448 {
1449 case CONVERT_NORMALIZED:
1450 IntToFpCast = Instruction::CastOps::SIToFP;
1451 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1452 break;
1453 case CONVERT_SSCALED:
1454 IntToFpCast = Instruction::CastOps::SIToFP;
1455 conversionFactor = VIMMED1((float)(1.0));
1456 break;
1457 case CONVERT_USCALED:
1458 SWR_ASSERT(0, "Type should not be sign extended!");
1459 conversionFactor = nullptr;
1460 break;
1461 default:
1462 SWR_ASSERT(conversionType == CONVERT_NONE);
1463 conversionFactor = nullptr;
1464 break;
1465 }
1466
1467 // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
1468 for (uint32_t i = 0; i < 4; i++)
1469 {
1470 if (isComponentEnabled(compMask, i))
1471 {
1472 if (compCtrl[i] == ComponentControl::StoreSrc)
1473 {
1474 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1475 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1476 // if x or y, use vi128XY permute result, else use vi128ZW
1477 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1478
1479 if (bFP) {
1480 // extract 128 bit lanes and convert each half-precision component to full float
1481 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1482 }
1483 else {
1484 // extract 128 bit lanes to sign extend each component
1485 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1486
1487 // denormalize if needed
1488 if (conversionType != CONVERT_NONE) {
1489 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1490 }
1491 }
1492 currentVertexElement++;
1493 }
1494 else
1495 {
1496 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1497 }
1498
1499 if (currentVertexElement > 3)
1500 {
1501 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1502 // reset to the next vVertexElement to output
1503 currentVertexElement = 0;
1504 }
1505 }
1506 }
1507 }
1508 // else zero extend
1509 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1510 {
1511 // pshufb masks for each component
1512 Value* vConstMask[2];
1513 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
1514 // x/z shuffle mask
1515 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1516 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1517 }
1518
1519 if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
1520 // y/w shuffle mask
1521 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1522 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1523 }
1524
1525 // init denormalize variables if needed
1526 Instruction::CastOps fpCast;
1527 Value* conversionFactor;
1528
1529 switch (conversionType)
1530 {
1531 case CONVERT_NORMALIZED:
1532 fpCast = Instruction::CastOps::UIToFP;
1533 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1534 break;
1535 case CONVERT_USCALED:
1536 fpCast = Instruction::CastOps::UIToFP;
1537 conversionFactor = VIMMED1((float)(1.0f));
1538 break;
1539 case CONVERT_SSCALED:
1540 SWR_ASSERT(0, "Type should not be zero extended!");
1541 conversionFactor = nullptr;
1542 break;
1543 default:
1544 SWR_ASSERT(conversionType == CONVERT_NONE);
1545 conversionFactor = nullptr;
1546 break;
1547 }
1548
1549 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1550 for (uint32_t i = 0; i < 4; i++)
1551 {
1552 if (isComponentEnabled(compMask, i))
1553 {
1554 if (compCtrl[i] == ComponentControl::StoreSrc)
1555 {
1556 // select correct constMask for x/z or y/w pshufb
1557 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1558 // if x or y, use vi128XY permute result, else use vi128ZW
1559 uint32_t selectedGather = (i < 2) ? 0 : 1;
1560
1561 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1562 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1563 // 256i - 0 1 2 3 4 5 6 7
1564 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1565
1566 // denormalize if needed
1567 if (conversionType != CONVERT_NONE)
1568 {
1569 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1570 }
1571 currentVertexElement++;
1572 }
1573 else
1574 {
1575 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1576 }
1577
1578 if (currentVertexElement > 3)
1579 {
1580 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1581 // reset to the next vVertexElement to output
1582 currentVertexElement = 0;
1583 }
1584 }
1585 }
1586 }
1587 else
1588 {
1589 SWR_ASSERT(0, "Unsupported conversion type");
1590 }
1591 }
1592
1593 //////////////////////////////////////////////////////////////////////////
1594 /// @brief Output a simdvertex worth of elements to the current outputElt
1595 /// @param pVtxOut - base address of VIN output struct
1596 /// @param outputElt - simdvertex offset in VIN to write to
1597 /// @param numEltsToStore - number of simdvertex rows to write out
1598 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
1599 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
1600 {
1601 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
1602
1603 for(uint32_t c = 0; c < numEltsToStore; ++c)
1604 {
1605 // STORE expects FP32 x vWidth type, just bitcast if needed
1606 if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){
1607 #if FETCH_DUMP_VERTEX
1608 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
1609 #endif
1610 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
1611 }
1612 #if FETCH_DUMP_VERTEX
1613 else
1614 {
1615 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
1616 }
1617 #endif
1618 // outputElt * 4 = offsetting by the size of a simdvertex
1619 // + c offsets to a 32bit x vWidth row within the current vertex
1620 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
1621 STORE(vVertexElements[c], dest);
1622 }
1623 }
1624
1625 //////////////////////////////////////////////////////////////////////////
1626 /// @brief Generates a constant vector of values based on the
1627 /// ComponentControl value
1628 /// @param ctrl - ComponentControl value
1629 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
1630 {
1631 switch(ctrl)
1632 {
1633 case NoStore: return VUNDEF_I();
1634 case Store0: return VIMMED1(0);
1635 case Store1Fp: return VIMMED1(1.0f);
1636 case Store1Int: return VIMMED1(1);
1637 case StoreVertexId:
1638 {
1639 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
1640 return VBROADCAST(pId);
1641 }
1642 case StoreInstanceId:
1643 {
1644 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
1645 return VBROADCAST(pId);
1646 }
1647 case StoreSrc:
1648 default: SWR_ASSERT(0, "Invalid component control"); return VUNDEF_I();
1649 }
1650 }
1651
1652 //////////////////////////////////////////////////////////////////////////
1653 /// @brief Returns the enable mask for the specified component.
1654 /// @param enableMask - enable bits
1655 /// @param component - component to check if enabled.
1656 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
1657 {
1658 switch (component)
1659 {
1660 // X
1661 case 0: return (enableMask & ComponentEnable::X);
1662 // Y
1663 case 1: return (enableMask & ComponentEnable::Y);
1664 // Z
1665 case 2: return (enableMask & ComponentEnable::Z);
1666 // W
1667 case 3: return (enableMask & ComponentEnable::W);
1668
1669 default: return false;
1670 }
1671 }
1672
1673
1674 //////////////////////////////////////////////////////////////////////////
1675 /// @brief JITs from fetch shader IR
1676 /// @param hJitMgr - JitManager handle
1677 /// @param func - LLVM function IR
1678 /// @return PFN_FETCH_FUNC - pointer to fetch code
1679 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
1680 {
1681 const llvm::Function* func = (const llvm::Function*)hFunc;
1682 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1683 PFN_FETCH_FUNC pfnFetch;
1684
1685 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
1686 // MCJIT finalizes modules the first time you JIT code from them. Once finalized, you cannot add new IR to the module
1687 pJitMgr->mIsModuleFinalized = true;
1688
1689 #if defined(KNOB_SWRC_TRACING)
1690 char fName[1024];
1691 const char *funcName = func->getName().data();
1692 sprintf(fName, "%s.bin", funcName);
1693 FILE *fd = fopen(fName, "wb");
1694 fwrite((void *)pfnFetch, 1, 2048, fd);
1695 fclose(fd);
1696 #endif
1697
1698 return pfnFetch;
1699 }
1700
1701 //////////////////////////////////////////////////////////////////////////
1702 /// @brief JIT compiles fetch shader
1703 /// @param hJitMgr - JitManager handle
1704 /// @param state - fetch state to build function from
1705 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
1706 {
1707 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1708
1709 pJitMgr->SetupNewModule();
1710
1711 FetchJit theJit(pJitMgr);
1712 HANDLE hFunc = theJit.Create(state);
1713
1714 return JitFetchFunc(hJitMgr, hFunc);
1715 }