1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_api.h"
31 #include "fetch_jit.h"
32 #include "builder.h"
33 #include "state_llvm.h"
34 #include <sstream>
35 #include <tuple>
36
37 //#define FETCH_DUMP_VERTEX 1
38
39 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
40
41 enum ConversionType
42 {
43 CONVERT_NONE,
44 CONVERT_NORMALIZED,
45 CONVERT_USCALED,
46 CONVERT_SSCALED,
47 };
48
49 //////////////////////////////////////////////////////////////////////////
50 /// Interface to Jitting a fetch shader
51 //////////////////////////////////////////////////////////////////////////
52 struct FetchJit : public Builder
53 {
54 FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
55
56 Function* Create(const FETCH_COMPILE_STATE& fetchState);
57 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
58 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
59 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
60
61 // package up Shuffle*bpcGatherd args into a tuple for convenience
62 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
63 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
64 const uint32_t(&)[4]> Shuffle8bpcArgs;
65 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
66
67 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
68 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
69 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
70
71 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
72
73 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
74
75 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
76 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
77
78 bool IsOddFormat(SWR_FORMAT format);
79 bool IsUniformFormat(SWR_FORMAT format);
80 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
81 void CreateGatherOddFormats(SWR_FORMAT format, Value* pBase, Value* offsets, Value* result[4]);
82 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
83
84 Value* mpFetchInfo;
85 };
86
87 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
88 {
89 static std::size_t fetchNum = 0;
90
91 std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
92 fnName << fetchNum++;
93
94 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
95 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
96
97 IRB()->SetInsertPoint(entry);
98
99 auto argitr = fetch->getArgumentList().begin();
100
101 // Fetch shader arguments
102 mpFetchInfo = &*argitr; ++argitr;
103 mpFetchInfo->setName("fetchInfo");
104 Value* pVtxOut = &*argitr;
105 pVtxOut->setName("vtxOutput");
106 // this is just shorthand to tell LLVM to get a pointer to the base address of the simdvertex:
107 //   index 0 - the pointer to the simdvertex structure itself
108 //   index 1 - which element of the simdvertex structure to offset to (in this case 0),
109 //             so the indices being i32's doesn't matter
110 // TODO: generate this GEP with a VECTOR structure type so this makes sense
111 std::vector<Value*> vtxInputIndices(2, C(0));
112 // GEP
113 pVtxOut = GEP(pVtxOut, C(0));
114 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
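// from here on, pVtxOut is treated as an array of 32bit x vWidth rows; each simdvertex
// element occupies 4 consecutive rows (see StoreVertexElements)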
115
116 // SWR_FETCH_CONTEXT::pStreams
117 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
118 streams->setName("pStreams");
119
120 // SWR_FETCH_CONTEXT::pIndices
121 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
122 indices->setName("pIndices");
123
124 // SWR_FETCH_CONTEXT::pLastIndex
125 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
126 pLastIndex->setName("pLastIndex");
127
128
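// build a SIMD of 32-bit indices from the index buffer, widening 8/16-bit indices as needed.
// unless bDisableIndexOOBCheck is set, indices past pLastIndex are read as 0.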
129 Value* vIndices;
130 switch(fetchState.indexType)
131 {
132 case R8_UINT:
133 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
134 if(fetchState.bDisableIndexOOBCheck){
135 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
136 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
137 }
138 else{
139 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
140 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
141 }
142 break;
143 case R16_UINT:
144 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
145 if(fetchState.bDisableIndexOOBCheck){
146 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
147 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
148 }
149 else{
150 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
151 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
152 }
153 break;
154 case R32_UINT:
155 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
156 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
157 break; // incoming type is already 32bit int
158 default: SWR_ASSERT(0, "Unsupported index type"); vIndices = nullptr; break;
159 }
160
161 // store out vertex IDs
162 STORE(vIndices, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
163
164 // store out cut mask if enabled
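// (the cut index is the primitive-restart value; the mask marks lanes whose index equals it)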
165 if (fetchState.bEnableCutIndex)
166 {
167 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
168 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
169 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
170 }
171
172 // Fetch attributes from memory and output to a simdvertex struct
173 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
174 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
175 : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
176
177 RET_VOID();
178
179 JitManager::DumpToFile(fetch, "src");
180
181 #if defined(_DEBUG)
182 verifyFunction(*fetch);
183 #endif
184
185 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
186
187 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
188 setupPasses.add(createBreakCriticalEdgesPass());
189 setupPasses.add(createCFGSimplificationPass());
190 setupPasses.add(createEarlyCSEPass());
191 setupPasses.add(createPromoteMemoryToRegisterPass());
192
193 setupPasses.run(*fetch);
194
195 JitManager::DumpToFile(fetch, "se");
196
197 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
198
199 ///@todo Haven't touched these either. Need to remove some of these and add others.
200 optPasses.add(createCFGSimplificationPass());
201 optPasses.add(createEarlyCSEPass());
202 optPasses.add(createInstructionCombiningPass());
203 optPasses.add(createInstructionSimplifierPass());
204 optPasses.add(createConstantPropagationPass());
205 optPasses.add(createSCCPPass());
206 optPasses.add(createAggressiveDCEPass());
207
208 optPasses.run(*fetch);
209 optPasses.run(*fetch);
210
211 JitManager::DumpToFile(fetch, "opt");
212
213 return fetch;
214 }
215
216 //////////////////////////////////////////////////////////////////////////
217 /// @brief Loads attributes from memory using LOADs, shuffling the
218 /// components into SOA form.
219 /// *Note* currently does not support component control,
220 /// component packing, instancing
221 /// @param fetchState - info about attributes to be fetched from memory
222 /// @param streams - value pointer to the current vertex stream
223 /// @param vIndices - vector value of indices to load
224 /// @param pVtxOut - value pointer to output simdvertex struct
225 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
226 {
227 // Zack shuffles; a variant of the Charleston.
228
229 std::vector<Value*> vectors(16);
230 std::vector<Constant*> pMask(mVWidth);
231 for(uint32_t i = 0; i < mVWidth; ++i)
232 {
233 pMask[i] = (C(i < 4 ? i : 4));
234 }
235 Constant* promoteMask = ConstantVector::get(pMask);
236 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
237
238 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
239 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
240 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
241 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
242 curInstance->setName("curInstance");
243
244 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
245 {
246 Value* elements[4] = {0};
247 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
248 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
249 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
250 uint32_t numComponents = info.numComps;
251 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
252
253 // load path doesn't support component packing
254 SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
255
256 vectors.clear();
257
258 Value *vCurIndices;
259 Value *startOffset;
260 if(ied.InstanceEnable)
261 {
262 Value* stepRate = C(ied.InstanceDataStepRate);
263
264 // prevent a div by 0 for 0 step rate
265 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
266 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
267
268 // calc the current offset into instanced data buffer
269 Value* calcInstance = UDIV(curInstance, stepRate);
270
271 // if step rate is 0, every instance gets instance 0
272 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
273
274 vCurIndices = VBROADCAST(calcInstance);
275
276 startOffset = startInstance;
277 }
278 else
279 {
280 // offset indices by baseVertex
281 vCurIndices = ADD(vIndices, vBaseVertex);
282
283 startOffset = startVertex;
284 }
285
286 // load SWR_VERTEX_BUFFER_STATE::pData
287 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
288
289 // load SWR_VERTEX_BUFFER_STATE::pitch
290 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
291 stride = Z_EXT(stride, mInt64Ty);
292
293 // load SWR_VERTEX_BUFFER_STATE::size
294 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
295 size = Z_EXT(size, mInt64Ty);
296
297 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
298
299 // Load from the stream.
300 for(uint32_t lane = 0; lane < mVWidth; ++lane)
301 {
302 // Get index
303 Value* index = VEXTRACT(vCurIndices, C(lane));
304 index = Z_EXT(index, mInt64Ty);
305
306 Value* offset = MUL(index, stride);
307 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
308 offset = ADD(offset, startVertexOffset);
309
310 if (!fetchState.bDisableIndexOOBCheck) {
311 // check for out of bound access, including partial OOB, and mask them to 0
312 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
313 Value *oob = ICMP_ULE(endOffset, size);
314 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
315 }
316
317 Value* pointer = GEP(stream, offset);
318 // We use a full-lane, but don't actually care.
319 Value* vptr = 0;
320
321 // get a pointer to a 4 component attrib in default address space
322 switch(bpc)
323 {
324 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
325 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
326 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
327 default: SWR_ASSERT(false, "Unsupported underlying bpp!");
328 }
329
330 // load 4 components of attribute
331 Value* vec = ALIGNED_LOAD(vptr, 1, false);
332
333 // Convert To FP32 internally
334 switch(info.type[0])
335 {
336 case SWR_TYPE_UNORM:
337 switch(bpc)
338 {
339 case 8:
340 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
341 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
342 break;
343 case 16:
344 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
345 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
346 break;
347 default:
348 SWR_ASSERT(false, "Unsupported underlying type!");
349 break;
350 }
351 break;
352 case SWR_TYPE_SNORM:
353 switch(bpc)
354 {
355 case 8:
356 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
357 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
358 break;
359 case 16:
360 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
361 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
362 break;
363 default:
364 SWR_ASSERT(false, "Unsupported underlying type!");
365 break;
366 }
367 break;
368 case SWR_TYPE_UINT:
369 // Zero extend uint32_t types.
370 switch(bpc)
371 {
372 case 8:
373 case 16:
374 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
375 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
376 break;
377 case 32:
378 break; // Pass through unchanged.
379 default:
380 SWR_ASSERT(false, "Unsupported underlying type!");
381 break;
382 }
383 break;
384 case SWR_TYPE_SINT:
385 // Sign extend SINT types.
386 switch(bpc)
387 {
388 case 8:
389 case 16:
390 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
391 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
392 break;
393 case 32:
394 break; // Pass through unchanged.
395 default:
396 SWR_ASSERT(false, "Unsupported underlying type!");
397 break;
398 }
399 break;
400 case SWR_TYPE_FLOAT:
401 switch(bpc)
402 {
403 case 32:
404 break; // Pass through unchanged.
405 default:
406 SWR_ASSERT(false, "Unsupported underlying type!");
407 }
408 break;
409 case SWR_TYPE_USCALED:
410 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
411 break;
412 case SWR_TYPE_SSCALED:
413 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
414 break;
415 case SWR_TYPE_UNKNOWN:
416 case SWR_TYPE_UNUSED:
417 SWR_ASSERT(false, "Unsupported type %d!", info.type[0]);
418 }
419
420 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
421 // uwvec: 4 x F32, undef value
422 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
423 vectors.push_back(wvec);
424 }
425
426 std::vector<Constant*> v01Mask(mVWidth);
427 std::vector<Constant*> v23Mask(mVWidth);
428 std::vector<Constant*> v02Mask(mVWidth);
429 std::vector<Constant*> v13Mask(mVWidth);
430
431 // Concatenate the vectors together.
432 elements[0] = VUNDEF_F();
433 elements[1] = VUNDEF_F();
434 elements[2] = VUNDEF_F();
435 elements[3] = VUNDEF_F();
436 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
437 {
438 v01Mask[4 * b + 0] = C(0 + 4 * b);
439 v01Mask[4 * b + 1] = C(1 + 4 * b);
440 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
441 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
442
443 v23Mask[4 * b + 0] = C(2 + 4 * b);
444 v23Mask[4 * b + 1] = C(3 + 4 * b);
445 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
446 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
447
448 v02Mask[4 * b + 0] = C(0 + 4 * b);
449 v02Mask[4 * b + 1] = C(2 + 4 * b);
450 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
451 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
452
453 v13Mask[4 * b + 0] = C(1 + 4 * b);
454 v13Mask[4 * b + 1] = C(3 + 4 * b);
455 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
456 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
457
458 std::vector<Constant*> iMask(mVWidth);
459 for(uint32_t i = 0; i < mVWidth; ++i)
460 {
461 if(((4 * b) <= i) && (i < (4 * (b + 1))))
462 {
463 iMask[i] = C(i % 4 + mVWidth);
464 }
465 else
466 {
467 iMask[i] = C(i);
468 }
469 }
470 Constant* insertMask = ConstantVector::get(iMask);
471 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
472 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
473 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
474 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
475 }
476
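// the shuffles below perform a 4x4 transpose per 128-bit block: on entry elements[k]
// holds whole xyzw attributes of vertices k, k+4, ... (AOS); on exit elements[0..3]
// hold the x, y, z and w components respectively across all SIMD lanes (SOA)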
477 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
478 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
479 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
480 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
481 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
482 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
483 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
484 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
485
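// write default values into any components the format doesn't supply (0.0f for x/y/z,
// 1.0f for w); the case fallthrough below is intentional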
486 switch(numComponents + 1)
487 {
488 case 1: elements[0] = VIMMED1(0.0f);
489 case 2: elements[1] = VIMMED1(0.0f);
490 case 3: elements[2] = VIMMED1(0.0f);
491 case 4: elements[3] = VIMMED1(1.0f);
492 }
493
494 for(uint32_t c = 0; c < 4; ++c)
495 {
496 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
497 STORE(elements[c], dest);
498 }
499 }
500 }
501
502 // returns true for odd formats that require special gather handling
503 bool FetchJit::IsOddFormat(SWR_FORMAT format)
504 {
505 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
506 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32)
507 {
508 return true;
509 }
510 return false;
511 }
512
513 // format is uniform if all components are the same size and type
514 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
515 {
516 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
517 uint32_t bpc0 = info.bpc[0];
518 uint32_t type0 = info.type[0];
519
520 for (uint32_t c = 1; c < info.numComps; ++c)
521 {
522 if (bpc0 != info.bpc[c] || type0 != info.type[c])
523 {
524 return false;
525 }
526 }
527 return true;
528 }
529
530 // unpacks components based on format
531 // foreach component in the pixel
532 // mask off everything but this component
533 // shift component to LSB
534 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
535 {
536 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
537
538 uint32_t bitOffset = 0;
539 for (uint32_t c = 0; c < info.numComps; ++c)
540 {
541 uint32_t swizzledIndex = info.swizzle[c];
542 uint32_t compBits = info.bpc[c];
543 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
544 Value* comp = AND(vInput, bitmask);
545 comp = LSHR(comp, bitOffset);
546
547 result[swizzledIndex] = comp;
548 bitOffset += compBits;
549 }
550 }
551
552 // gather for odd component size formats
553 // gather a SIMD of full pixels per lane, then shift/mask to move each component to its
554 // own vector
555 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pBase, Value* offsets, Value* result[4])
556 {
557 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
558
559 // only works if pixel size is <= 32bits
560 SWR_ASSERT(info.bpp <= 32);
561
562 Value* gather = VUNDEF_I();
563
564 // assign defaults
565 for (uint32_t comp = 0; comp < 4; ++comp)
566 {
567 result[comp] = VIMMED1((int)info.defaults[comp]);
568 }
569
570 // gather SIMD pixels
571 for (uint32_t e = 0; e < JM()->mVWidth; ++e)
572 {
573 Value* elemOffset = VEXTRACT(offsets, C(e));
574 Value* load = GEP(pBase, elemOffset);
575
576 // load the proper amount of data based on component size
577 switch (info.bpp)
578 {
579 case 8: load = POINTER_CAST(load, Type::getInt8PtrTy(JM()->mContext)); break;
580 case 16: load = POINTER_CAST(load, Type::getInt16PtrTy(JM()->mContext)); break;
581 case 32: load = POINTER_CAST(load, Type::getInt32PtrTy(JM()->mContext)); break;
582 default: SWR_ASSERT(0);
583 }
584
585 // load pixel
586 Value *val = LOAD(load);
587
588 // zero extend to 32bit integer
589 val = INT_CAST(val, mInt32Ty, false);
590
591 // store in simd lane
592 gather = VINSERT(gather, val, C(e));
593 }
594
595 UnpackComponents(format, gather, result);
596
597 // cast to fp32
598 result[0] = BITCAST(result[0], mSimdFP32Ty);
599 result[1] = BITCAST(result[1], mSimdFP32Ty);
600 result[2] = BITCAST(result[2], mSimdFP32Ty);
601 result[3] = BITCAST(result[3], mSimdFP32Ty);
602 }
603
604 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
605 {
606 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
607
608 for (uint32_t c = 0; c < info.numComps; ++c)
609 {
610 uint32_t compIndex = info.swizzle[c];
611
612 // skip any conversion on UNUSED components
613 if (info.type[c] == SWR_TYPE_UNUSED)
614 {
615 continue;
616 }
617
618 if (info.isNormalized[c])
619 {
620 if (info.type[c] == SWR_TYPE_SNORM)
621 {
622 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
623
624 /// result = c * (1.0f / (2^(n-1) - 1))
625 uint32_t n = info.bpc[c];
626 uint32_t pow2 = 1 << (n - 1);
627 float scale = 1.0f / (float)(pow2 - 1);
628 Value *vScale = VIMMED1(scale);
629 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
630 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
631 texels[compIndex] = FMUL(texels[compIndex], vScale);
632 }
633 else
634 {
635 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
636
637 /// result = c * (1.0f / (2^n - 1))
638 uint32_t n = info.bpc[c];
639 uint32_t pow2 = 1 << n;
640 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
641 if (n == 24)
642 {
643 float scale = (float)(pow2 - 1);
644 Value* vScale = VIMMED1(scale);
645 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
646 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
647 texels[compIndex] = FDIV(texels[compIndex], vScale);
648 }
649 else
650 {
651 float scale = 1.0f / (float)(pow2 - 1);
652 Value *vScale = VIMMED1(scale);
653 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
654 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
655 texels[compIndex] = FMUL(texels[compIndex], vScale);
656 }
657 }
658 continue;
659 }
660 }
661 }
662
663 //////////////////////////////////////////////////////////////////////////
664 /// @brief Loads attributes from memory using AVX2 GATHER(s)
665 /// @param fetchState - info about attributes to be fetched from memory
666 /// @param streams - value pointer to the current vertex stream
667 /// @param vIndices - vector value of indices to gather
668 /// @param pVtxOut - value pointer to output simdvertex struct
669 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
670 Value* streams, Value* vIndices, Value* pVtxOut)
671 {
672 uint32_t currentVertexElement = 0;
673 uint32_t outputElt = 0;
674 Value* vVertexElements[4];
675
676 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
677 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
678 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
679 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
680 curInstance->setName("curInstance");
681
682 for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
683 {
684 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
685
686 // skip element if all components are disabled
687 if (ied.ComponentPacking == ComponentEnable::NONE)
688 {
689 continue;
690 }
691
692 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
693 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
694 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
695
696 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
697
698 // VGATHER* takes an *i8 src pointer
699 Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
700
701 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
702 Value *vStride = VBROADCAST(stride);
703
704 // max vertex index that is fully in bounds
705 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
706 maxVertex = LOAD(maxVertex);
707
708 Value *vCurIndices;
709 Value *startOffset;
710 if(ied.InstanceEnable)
711 {
712 Value* stepRate = C(ied.InstanceDataStepRate);
713
714 // prevent a div by 0 for 0 step rate
715 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
716 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
717
718 // calc the current offset into instanced data buffer
719 Value* calcInstance = UDIV(curInstance, stepRate);
720
721 // if step rate is 0, every instance gets instance 0
722 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
723
724 vCurIndices = VBROADCAST(calcInstance);
725
726 startOffset = startInstance;
727 }
728 else
729 {
730 // offset indices by baseVertex
731 vCurIndices = ADD(vIndices, vBaseVertex);
732
733 startOffset = startVertex;
734 }
735
736 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
737 // do 64bit address offset calculations.
738
739 // calculate byte offset to the start of the VB
740 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
741 pStreamBase = GEP(pStreamBase, baseOffset);
742
743 // if we have a start offset, subtract from max vertex. Used for OOB check
744 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
745 Value* neg = ICMP_SLT(maxVertex, C((int64_t)0));
746 // if we have a negative value, we're already OOB. clamp at 0.
747 maxVertex = SELECT(neg, C(0), TRUNC(maxVertex, mInt32Ty));
748
749 // Load the in bounds size of a partially valid vertex
750 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
751 partialInboundsSize = LOAD(partialInboundsSize);
752 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
753 Value* vBpp = VBROADCAST(C(info.Bpp));
754 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
755
756 // is the element <= the partially valid size
757 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
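// a vertex exactly at maxVertex may still be partially valid: its element is fetched
// only if it lies within the partially-in-bounds size (blended into the mask below)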
758
759 // override cur indices with 0 if pitch is 0
760 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
761 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
762
763 // are vertices partially OOB?
764 Value* vMaxVertex = VBROADCAST(maxVertex);
765 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
766
767 // are vertices fully in bounds?
768 Value* vGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
769
770 // blend in any partially OOB indices that have valid elements
771 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
772 vGatherMask = VMASK(vGatherMask);
773
774 // calculate the actual offsets into the VB
775 Value* vOffsets = MUL(vCurIndices, vStride);
776 vOffsets = ADD(vOffsets, vAlignmentOffsets);
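// per lane this is: offset = index * pitch + AlignedByteOffset, relative to pStreamBase
// (which already includes startOffset * pitch)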
777
778 // Packing and component control
779 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
780 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
781 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
782
783 // Special gather/conversion for formats without equal component sizes
784 if (IsOddFormat((SWR_FORMAT)ied.Format))
785 {
786 Value* pResults[4];
787 CreateGatherOddFormats((SWR_FORMAT)ied.Format, pStreamBase, vOffsets, pResults);
788 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
789
790 for (uint32_t c = 0; c < 4; ++c)
791 {
792 if (isComponentEnabled(compMask, c))
793 {
794 vVertexElements[currentVertexElement++] = pResults[c];
795 if (currentVertexElement > 3)
796 {
797 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
798 // reset to the next vVertexElement to output
799 currentVertexElement = 0;
800 }
801 }
802 }
803 }
804 else if(info.type[0] == SWR_TYPE_FLOAT)
805 {
806 ///@todo: support 64 bit vb accesses
807 Value* gatherSrc = VIMMED1(0.0f);
808
809 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
810 "Unsupported format for standard gather fetch.");
811
812 // Gather components from memory to store in a simdvertex structure
813 switch(bpc)
814 {
815 case 16:
816 {
817 Value* vGatherResult[2];
818 Value *vMask;
819
820 // if we have at least one component out of x or y to fetch
821 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
822 // save mask as it is zero'd out after each gather
823 vMask = vGatherMask;
824
825 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
826 // e.g. result of first 8x32bit integer gather for 16bit components
827 // 256i - 0 1 2 3 4 5 6 7
828 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
829 //
830 }
831
832 // if we have at least one component out of z or w to fetch
833 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
834 // offset base to the next components(zw) in the vertex to gather
835 pStreamBase = GEP(pStreamBase, C((char)4));
836 vMask = vGatherMask;
837
838 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
839 // e.g. result of second 8x32bit integer gather for 16bit components
840 // 256i - 0 1 2 3 4 5 6 7
841 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
842 //
843 }
844
845 // if we have at least one component to shuffle into place
846 if(compMask){
847 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
848 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
849
850 // Shuffle gathered components into place in simdvertex struct
851 Shuffle16bpcGather(args); // outputs to vVertexElements ref
852 }
853 }
854 break;
855 case 32:
856 {
857 for (uint32_t i = 0; i < 4; i++)
858 {
859 if (isComponentEnabled(compMask, i))
860 {
861 // if we need to gather the component
862 if (compCtrl[i] == StoreSrc)
863 {
864 // save mask as it is zero'd out after each gather
865 Value *vMask = vGatherMask;
866
867 // Gather a SIMD of vertices
868 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
869 }
870 else
871 {
872 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
873 }
874
875 if (currentVertexElement > 3)
876 {
877 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
878 // reset to the next vVertexElement to output
879 currentVertexElement = 0;
880 }
881
882 }
883
884 // offset base to the next component in the vertex to gather
885 pStreamBase = GEP(pStreamBase, C((char)4));
886 }
887 }
888 break;
889 default:
890 SWR_ASSERT(0, "Tried to fetch invalid FP format");
891 break;
892 }
893 }
894 else
895 {
896 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
897 ConversionType conversionType = CONVERT_NONE;
898
899 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
900 "Unsupported format for standard gather fetch.");
901
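// derive the integer extension and conversion behavior from the component type; the
// UNORM->UINT and SNORM->SINT case fallthroughs below are intentional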
902 switch(info.type[0])
903 {
904 case SWR_TYPE_UNORM:
905 conversionType = CONVERT_NORMALIZED;
906 case SWR_TYPE_UINT:
907 extendCastType = Instruction::CastOps::ZExt;
908 break;
909 case SWR_TYPE_SNORM:
910 conversionType = CONVERT_NORMALIZED;
911 case SWR_TYPE_SINT:
912 extendCastType = Instruction::CastOps::SExt;
913 break;
914 case SWR_TYPE_USCALED:
915 conversionType = CONVERT_USCALED;
916 extendCastType = Instruction::CastOps::UIToFP;
917 break;
918 case SWR_TYPE_SSCALED:
919 conversionType = CONVERT_SSCALED;
920 extendCastType = Instruction::CastOps::SIToFP;
921 break;
922 default:
923 break;
924 }
925
926 // value substituted when component of gather is masked
927 Value* gatherSrc = VIMMED1(0);
928
929 // Gather components from memory to store in a simdvertex structure
930 switch (bpc)
931 {
932 case 8:
933 {
934 // if we have at least one component to fetch
935 if(compMask)
936 {
937 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
938 // e.g. result of an 8x32bit integer gather for 8bit components
939 // 256i - 0 1 2 3 4 5 6 7
940 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
941
942 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
943 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
944
945 // Shuffle gathered components into place in simdvertex struct
946 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
947 }
948 }
949 break;
950 case 16:
951 {
952 Value* vGatherResult[2];
953 Value *vMask;
954
955 // if we have at least one component out of x or y to fetch
956 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
957 // save mask as it is zero'd out after each gather
958 vMask = vGatherMask;
959
960 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
961 // e.g. result of first 8x32bit integer gather for 16bit components
962 // 256i - 0 1 2 3 4 5 6 7
963 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
964 //
965 }
966
967 // if we have at least one component out of z or w to fetch
968 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
969 // offset base to the next components(zw) in the vertex to gather
970 pStreamBase = GEP(pStreamBase, C((char)4));
971 vMask = vGatherMask;
972
973 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
974 // e.g. result of second 8x32bit integer gather for 16bit components
975 // 256i - 0 1 2 3 4 5 6 7
976 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
977 //
978 }
979
980 // if we have at least one component to shuffle into place
981 if(compMask){
982 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
983 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
984
985 // Shuffle gathered components into place in simdvertex struct
986 Shuffle16bpcGather(args); // outputs to vVertexElements ref
987 }
988 }
989 break;
990 case 32:
991 {
992 // Gather components from memory and place them in the simdvertex struct
993 for (uint32_t i = 0; i < 4; i++)
994 {
995 if (isComponentEnabled(compMask, i))
996 {
997 // if we need to gather the component
998 if (compCtrl[i] == StoreSrc)
999 {
1000 // save mask as it is zero'd out after each gather
1001 Value *vMask = vGatherMask;
1002
1003 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
1004
1005 if (conversionType == CONVERT_USCALED)
1006 {
1007 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1008 }
1009 else if (conversionType == CONVERT_SSCALED)
1010 {
1011 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1012 }
1013
1014 vVertexElements[currentVertexElement++] = pGather;
1015 // e.g. result of a single 8x32bit integer gather for 32bit components
1016 // 256i - 0 1 2 3 4 5 6 7
1017 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1018 }
1019 else
1020 {
1021 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1022 }
1023
1024 if (currentVertexElement > 3)
1025 {
1026 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1027 // reset to the next vVertexElement to output
1028 currentVertexElement = 0;
1029 }
1030
1031 }
1032
1033 // offset base to the next component in the vertex to gather
1034 pStreamBase = GEP(pStreamBase, C((char)4));
1035 }
1036 }
1037 break;
1038 }
1039 }
1040 }
1041
1042 // if we have a partially filled vVertexElement struct, output it
1043 if(currentVertexElement > 0){
1044 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1045 }
1046 }
1047
1048 //////////////////////////////////////////////////////////////////////////
1049 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1050 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1051 /// support
1052 /// @param pIndices - pointer to 8 bit indices
1053 /// @param pLastIndex - pointer to last valid index
1054 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1055 {
1056 // can fit 4 8 bit integers per vWidth lane
1057 Value* vIndices = VUNDEF_I();
1058
1059 // store 0 index on stack to be used to conditionally load from if index address is OOB
1060 Value* pZeroIndex = ALLOCA(mInt8Ty);
1061 STORE(C((uint8_t)0), pZeroIndex);
1062
1063 // Load a SIMD of index pointers
1064 for(int64_t lane = 0; lane < mVWidth; lane++)
1065 {
1066 // Calculate the address of the requested index
1067 Value *pIndex = GEP(pIndices, C(lane));
1068
1069 // check if the address is less than the max index,
1070 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1071
1072 // if valid, load the index. if not, load 0 from the stack
1073 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1074 Value *index = LOAD(pValid, "valid index");
1075
1076 // zero extend index to 32 bits and insert into the correct simd lane
1077 index = Z_EXT(index, mInt32Ty);
1078 vIndices = VINSERT(vIndices, index, lane);
1079 }
1080 return vIndices;
1081 }
1082
1083 //////////////////////////////////////////////////////////////////////////
1084 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1085 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1086 /// support
1087 /// @param pIndices - pointer to 16 bit indices
1088 /// @param pLastIndex - pointer to last valid index
1089 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1090 {
1091 // can fit 2 16 bit integers per vWidth lane
1092 Value* vIndices = VUNDEF_I();
1093
1094 // store 0 index on stack to be used to conditionally load from if index address is OOB
1095 Value* pZeroIndex = ALLOCA(mInt16Ty);
1096 STORE(C((uint16_t)0), pZeroIndex);
1097
1098 // Load a SIMD of index pointers
1099 for(int64_t lane = 0; lane < mVWidth; lane++)
1100 {
1101 // Calculate the address of the requested index
1102 Value *pIndex = GEP(pIndices, C(lane));
1103
1104 // check if the address is less than the max index,
1105 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1106
1107 // if valid, load the index. if not, load 0 from the stack
1108 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1109 Value *index = LOAD(pValid, "valid index");
1110
1111 // zero extended index to 32 bits and insert into the correct simd lane
1112 index = Z_EXT(index, mInt32Ty);
1113 vIndices = VINSERT(vIndices, index, lane);
1114 }
1115 return vIndices;
1116 }
1117
1118 //////////////////////////////////////////////////////////////////////////
1119 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1120 /// @param pIndices - pointer to 32 bit indices
1121 /// @param pLastIndex - pointer to last valid index
1122 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1123 {
1124 DataLayout dL(JM()->mpCurrentModule);
1125 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1126 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1127 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1128
1129 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1130 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1131 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1132 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1133
1134 // create a vector of index counts from the base index ptr passed into the fetch
1135 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1136 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1137
1138 // compare index count to the max valid index
1139 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1140 // vIndexOffsets 0 1 2 3 4 5 6 7
1141 // ------------------------------
1142 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1143 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1144 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1145 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1146
1147 // VMASKLOAD takes an *i8 src pointer
1148 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1149
1150 // Load the indices; OOB loads 0
1151 return MASKLOADD(pIndices,vIndexMask);
1152 }
1153
1154 //////////////////////////////////////////////////////////////////////////
1155 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1156 /// denormalizes if needed, converts to F32 if needed, and positions in
1157 /// the proper SIMD rows to be output to the simdvertex structure
1158 /// @param args: (tuple of args, listed below)
1159 /// @param vGatherResult - 8 gathered 8bpc vertices
1160 /// @param pVtxOut - base pointer to output simdvertex struct
1161 /// @param extendType - sign extend or zero extend
1162 /// @param conversionType - conversion to apply (none, normalized, uscaled, sscaled)
1163 /// @param currentVertexElement - reference to the current vVertexElement
1164 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1165 /// @param compMask - component packing mask
1166 /// @param compCtrl - component control val
1167 /// @param vVertexElements[4] - vertex components to output
1168 /// @param swizzle[4] - component swizzle location
1169 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1170 {
1171 // Unpack tuple args
1172 Value*& vGatherResult = std::get<0>(args);
1173 Value* pVtxOut = std::get<1>(args);
1174 const Instruction::CastOps extendType = std::get<2>(args);
1175 const ConversionType conversionType = std::get<3>(args);
1176 uint32_t &currentVertexElement = std::get<4>(args);
1177 uint32_t &outputElt = std::get<5>(args);
1178 const ComponentEnable compMask = std::get<6>(args);
1179 const ComponentControl (&compCtrl)[4] = std::get<7>(args);
1180 Value* (&vVertexElements)[4] = std::get<8>(args);
1181 const uint32_t (&swizzle)[4] = std::get<9>(args);
1182
1183 // cast types
1184 Type* vGatherTy = mSimdInt32Ty;
1185 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1186
1187 // have to do extra work for sign extending
1188 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
1189 Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
1190 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
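// v128Ty views the shuffle result as 128-bit lanes so VEXTRACT below can pull out
// one lane at a time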
1191
1192 // shuffle mask, including any swizzling
1193 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1194 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1195 Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
1196 char(y), char(y+4), char(y+8), char(y+12),
1197 char(z), char(z+4), char(z+8), char(z+12),
1198 char(w), char(w+4), char(w+8), char(w+12),
1199 char(x), char(x+4), char(x+8), char(x+12),
1200 char(y), char(y+4), char(y+8), char(y+12),
1201 char(z), char(z+4), char(z+8), char(z+12),
1202 char(w), char(w+4), char(w+8), char(w+12)});
1203
1204 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1205 // after pshufb: group components together in each 128bit lane
1206 // 256i - 0 1 2 3 4 5 6 7
1207 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1208
1209 Value* vi128XY = nullptr;
1210 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1211 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1212 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1213 // 256i - 0 1 2 3 4 5 6 7
1214 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1215 }
1216
1217 // do the same for zw components
1218 Value* vi128ZW = nullptr;
1219 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1220 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1221 }
1222
1223 // init denormalize variables if needed
1224 Instruction::CastOps fpCast;
1225 Value* conversionFactor;
1226
1227 switch (conversionType)
1228 {
1229 case CONVERT_NORMALIZED:
1230 fpCast = Instruction::CastOps::SIToFP;
1231 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1232 break;
1233 case CONVERT_SSCALED:
1234 fpCast = Instruction::CastOps::SIToFP;
1235 conversionFactor = VIMMED1((float)(1.0));
1236 break;
1237 case CONVERT_USCALED:
1238 SWR_ASSERT(0, "Type should not be sign extended!");
1239 conversionFactor = nullptr;
1240 break;
1241 default:
1242 SWR_ASSERT(conversionType == CONVERT_NONE);
1243 conversionFactor = nullptr;
1244 break;
1245 }
1246
1247 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1248 for (uint32_t i = 0; i < 4; i++)
1249 {
1250 if (isComponentEnabled(compMask, i))
1251 {
1252 if (compCtrl[i] == ComponentControl::StoreSrc)
1253 {
1254 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1255 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1256 // if x or y, use vi128XY permute result, else use vi128ZW
1257 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1258
1259 // sign extend
1260 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
1261
1262 // denormalize if needed
1263 if (conversionType != CONVERT_NONE)
1264 {
1265 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1266 }
1267 currentVertexElement++;
1268 }
1269 else
1270 {
1271 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1272 }
1273
1274 if (currentVertexElement > 3)
1275 {
1276 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1277 // reset to the next vVertexElement to output
1278 currentVertexElement = 0;
1279 }
1280 }
1281 }
1282 }
1283 // else zero extend
1284 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1285 {
1286 // init denormalize variables if needed
1287 Instruction::CastOps fpCast;
1288 Value* conversionFactor;
1289
1290 switch (conversionType)
1291 {
1292 case CONVERT_NORMALIZED:
1293 fpCast = Instruction::CastOps::UIToFP;
1294 conversionFactor = VIMMED1((float)(1.0 / 255.0));
1295 break;
1296 case CONVERT_USCALED:
1297 fpCast = Instruction::CastOps::UIToFP;
1298 conversionFactor = VIMMED1((float)(1.0));
1299 break;
1300 case CONVERT_SSCALED:
1301 SWR_ASSERT(0, "Type should not be zero extended!");
1302 conversionFactor = nullptr;
1303 break;
1304 default:
1305 SWR_ASSERT(conversionType == CONVERT_NONE);
1306 conversionFactor = nullptr;
1307 break;
1308 }
1309
1310 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1311 for (uint32_t i = 0; i < 4; i++)
1312 {
1313 if (isComponentEnabled(compMask, i))
1314 {
1315 if (compCtrl[i] == ComponentControl::StoreSrc)
1316 {
1317 // pshufb masks for each component
1318 Value* vConstMask;
1319 switch (swizzle[i])
1320 {
1321 case 0:
1322 // x shuffle mask
1323 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1324 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
1325 break;
1326 case 1:
1327 // y shuffle mask
1328 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1329 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
1330 break;
1331 case 2:
1332 // z shuffle mask
1333 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1334 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
1335 break;
1336 case 3:
1337 // w shuffle mask
1338 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1339 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
1340 break;
1341 default:
1342 vConstMask = nullptr;
1343 break;
1344 }
1345
1346 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1347 // after pshufb for x channel
1348 // 256i - 0 1 2 3 4 5 6 7
1349 // x000 x000 x000 x000 x000 x000 x000 x000
1350
1351 // denormalize if needed
1352 if (conversionType != CONVERT_NONE)
1353 {
1354 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1355 }
1356 currentVertexElement++;
1357 }
1358 else
1359 {
1360 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1361 }
1362
1363 if (currentVertexElement > 3)
1364 {
1365 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1366 // reset to the next vVertexElement to output
1367 currentVertexElement = 0;
1368 }
1369 }
1370 }
1371 }
1372 else
1373 {
1374 SWR_ASSERT(0, "Unsupported conversion type");
1375 }
1376 }
1377
1378 //////////////////////////////////////////////////////////////////////////
1379 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1380 /// denormalizes if needed, converts to F32 if needed, and positions in
1381 /// the proper SIMD rows to be output to the simdvertex structure
1382 /// @param args: (tuple of args, listed below)
1383 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1384 /// @param pVtxOut - base pointer to output simdvertex struct
1385 /// @param extendType - sign extend or zero extend
1386 /// @param conversionType - conversion to apply (none, normalized, uscaled, sscaled)
1387 /// @param currentVertexElement - reference to the current vVertexElement
1388 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1389 /// @param compMask - component packing mask
1390 /// @param compCtrl - component control val
1391 /// @param vVertexElements[4] - vertex components to output
1392 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
1393 {
1394 // Unpack tuple args
1395 Value* (&vGatherResult)[2] = std::get<0>(args);
1396 Value* pVtxOut = std::get<1>(args);
1397 const Instruction::CastOps extendType = std::get<2>(args);
1398 const ConversionType conversionType = std::get<3>(args);
1399 uint32_t &currentVertexElement = std::get<4>(args);
1400 uint32_t &outputElt = std::get<5>(args);
1401 const ComponentEnable compMask = std::get<6>(args);
1402 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1403 Value* (&vVertexElements)[4] = std::get<8>(args);
1404
1405 // cast types
1406 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1407 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1408
1409 // have to do extra work for sign extending
1410 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
1411 (extendType == Instruction::CastOps::FPExt))
1412 {
1413 // is this a half-precision (PP) float? If so, it's converted with CVTPH2PS below instead of sign extended
1414 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1415
1416 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1417 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1418
1419 // shuffle mask
1420 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1421 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1422 Value* vi128XY = nullptr;
1423 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1424 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1425 // after pshufb: group components together in each 128bit lane
1426 // 256i - 0 1 2 3 4 5 6 7
1427 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1428
1429 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1430 // after PERMD: move and pack xy components into each 128bit lane
1431 // 256i - 0 1 2 3 4 5 6 7
1432 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1433 }
1434
1435 // do the same for zw components
1436 Value* vi128ZW = nullptr;
1437 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1438 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1439 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1440 }
1441
1442 // init denormalize variables if needed
1443 Instruction::CastOps IntToFpCast;
1444 Value* conversionFactor;
1445
1446 switch (conversionType)
1447 {
1448 case CONVERT_NORMALIZED:
1449 IntToFpCast = Instruction::CastOps::SIToFP;
1450 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1451 break;
1452 case CONVERT_SSCALED:
1453 IntToFpCast = Instruction::CastOps::SIToFP;
1454 conversionFactor = VIMMED1((float)(1.0));
1455 break;
1456 case CONVERT_USCALED:
1457 SWR_ASSERT(0, "Type should not be sign extended!");
1458 conversionFactor = nullptr;
1459 break;
1460 default:
1461 SWR_ASSERT(conversionType == CONVERT_NONE);
1462 conversionFactor = nullptr;
1463 break;
1464 }
1465
1466 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1467 for (uint32_t i = 0; i < 4; i++)
1468 {
1469 if (isComponentEnabled(compMask, i))
1470 {
1471 if (compCtrl[i] == ComponentControl::StoreSrc)
1472 {
1473 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1474 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1475 // if x or y, use vi128XY permute result, else use vi128ZW
1476 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1477
1478 if (bFP) {
1479 // extract 128 bit lanes and convert each half-float component to 32-bit float
1480 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1481 }
1482 else {
1483 // extract 128 bit lanes to sign extend each component
1484 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1485
1486 // denormalize if needed
1487 if (conversionType != CONVERT_NONE) {
1488 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1489 }
1490 }
1491 currentVertexElement++;
1492 }
1493 else
1494 {
1495 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1496 }
1497
1498 if (currentVertexElement > 3)
1499 {
1500 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1501 // reset to the next vVertexElement to output
1502 currentVertexElement = 0;
1503 }
1504 }
1505 }
1506 }
1507 // else zero extend
1508 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1509 {
1510 // pshufb masks for each component
1511 Value* vConstMask[2];
1512 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
1513 // x/z shuffle mask
1514 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1515 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1516 }
1517
1518 if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
1519 // y/w shuffle mask
1520 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1521 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1522 }
1523
1524 // init denormalize variables if needed
1525 Instruction::CastOps fpCast;
1526 Value* conversionFactor;
1527
1528 switch (conversionType)
1529 {
1530 case CONVERT_NORMALIZED:
1531 fpCast = Instruction::CastOps::UIToFP;
1532 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1533 break;
1534 case CONVERT_USCALED:
1535 fpCast = Instruction::CastOps::UIToFP;
1536 conversionFactor = VIMMED1((float)(1.0f));
1537 break;
1538 case CONVERT_SSCALED:
1539 SWR_ASSERT(0, "Type should not be zero extended!");
1540 conversionFactor = nullptr;
1541 break;
1542 default:
1543 SWR_ASSERT(conversionType == CONVERT_NONE);
1544 conversionFactor = nullptr;
1545 break;
1546 }
1547
1548 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1549 for (uint32_t i = 0; i < 4; i++)
1550 {
1551 if (isComponentEnabled(compMask, i))
1552 {
1553 if (compCtrl[i] == ComponentControl::StoreSrc)
1554 {
1555 // select correct constMask for x/z or y/w pshufb
1556 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1557 // if x or y, use vi128XY permute result, else use vi128ZW
1558 uint32_t selectedGather = (i < 2) ? 0 : 1;
1559
1560 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1561 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1562 // 256i - 0 1 2 3 4 5 6 7
1563 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1564
1565 // denormalize if needed
1566 if (conversionType != CONVERT_NONE)
1567 {
1568 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1569 }
1570 currentVertexElement++;
1571 }
1572 else
1573 {
1574 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1575 }
1576
1577 if (currentVertexElement > 3)
1578 {
1579 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1580 // reset to the next vVertexElement to output
1581 currentVertexElement = 0;
1582 }
1583 }
1584 }
1585 }
1586 else
1587 {
1588 SWR_ASSERT(0, "Unsupported conversion type");
1589 }
1590 }
1591
1592 //////////////////////////////////////////////////////////////////////////
1593 /// @brief Output a simdvertex worth of elements to the current outputElt
1594 /// @param pVtxOut - base address of VIN output struct
1595 /// @param outputElt - simdvertex offset in VIN to write to
1596 /// @param numEltsToStore - number of simdvertex rows to write out
1597 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
1598 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
1599 {
1600 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
1601
1602 for(uint32_t c = 0; c < numEltsToStore; ++c)
1603 {
1604 // STORE expects FP32 x vWidth type, just bitcast if needed
1605 if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){
1606 #if FETCH_DUMP_VERTEX
1607 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
1608 #endif
1609 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
1610 }
1611 #if FETCH_DUMP_VERTEX
1612 else
1613 {
1614 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
1615 }
1616 #endif
1617 // outputElt * 4 = offsetting by the size of a simdvertex
1618 // + c offsets to a 32bit x vWidth row within the current vertex
1619 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
1620 STORE(vVertexElements[c], dest);
1621 }
1622 }
1623
1624 //////////////////////////////////////////////////////////////////////////
1625 /// @brief Generates a constant vector of values based on the
1626 /// ComponentControl value
1627 /// @param ctrl - ComponentControl value
1628 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
1629 {
1630 switch(ctrl)
1631 {
1632 case NoStore: return VUNDEF_I();
1633 case Store0: return VIMMED1(0);
1634 case Store1Fp: return VIMMED1(1.0f);
1635 case Store1Int: return VIMMED1(1);
1636 case StoreVertexId:
1637 {
1638 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
1639 return VBROADCAST(pId);
1640 }
1641 case StoreInstanceId:
1642 {
1643 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
1644 return VBROADCAST(pId);
1645 }
1646 case StoreSrc:
1647 default: SWR_ASSERT(0, "Invalid component control"); return VUNDEF_I();
1648 }
1649 }
1650
1651 //////////////////////////////////////////////////////////////////////////
1652 /// @brief Returns true if the specified component is enabled.
1653 /// @param enableMask - enable bits
1654 /// @param component - component to check if enabled.
1655 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
1656 {
1657 switch (component)
1658 {
1659 // X
1660 case 0: return (enableMask & ComponentEnable::X);
1661 // Y
1662 case 1: return (enableMask & ComponentEnable::Y);
1663 // Z
1664 case 2: return (enableMask & ComponentEnable::Z);
1665 // W
1666 case 3: return (enableMask & ComponentEnable::W);
1667
1668 default: return false;
1669 }
1670 }
1671
1672
1673 //////////////////////////////////////////////////////////////////////////
1674 /// @brief JITs from fetch shader IR
1675 /// @param hJitMgr - JitManager handle
1676 /// @param func - LLVM function IR
1677 /// @return PFN_FETCH_FUNC - pointer to fetch code
1678 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
1679 {
1680 const llvm::Function* func = (const llvm::Function*)hFunc;
1681 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1682 PFN_FETCH_FUNC pfnFetch;
1683
1684 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
1685 // MCJIT finalizes modules the first time you JIT code from them. Once finalized, you cannot add new IR to the module
1686 pJitMgr->mIsModuleFinalized = true;
1687
1688 #if defined(KNOB_SWRC_TRACING)
1689 char fName[1024];
1690 const char *funcName = func->getName().data();
1691 sprintf(fName, "%s.bin", funcName);
1692 FILE *fd = fopen(fName, "wb");
1693 fwrite((void *)pfnFetch, 1, 2048, fd);
1694 fclose(fd);
1695 #endif
1696
1697 return pfnFetch;
1698 }
1699
1700 //////////////////////////////////////////////////////////////////////////
1701 /// @brief JIT compiles fetch shader
1702 /// @param hJitMgr - JitManager handle
1703 /// @param state - fetch state to build function from
1704 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
1705 {
1706 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1707
1708 pJitMgr->SetupNewModule();
1709
1710 FetchJit theJit(pJitMgr);
1711 HANDLE hFunc = theJit.Create(state);
1712
1713 return JitFetchFunc(hJitMgr, hFunc);
1714 }