[mesa.git] src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp (commit ba1096dc21487af9e5272393d5625dec4d488274)
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_api.h"
31 #include "fetch_jit.h"
32 #include "builder.h"
33 #include "state_llvm.h"
34 #include "common/containers.hpp"
35 #include "llvm/IR/DataLayout.h"
36 #include <sstream>
37 #include <tuple>
38
39 //#define FETCH_DUMP_VERTEX 1
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45 CONVERT_NONE,
46 CONVERT_NORMALIZED,
47 CONVERT_USCALED,
48 CONVERT_SSCALED,
49 };
50
51 //////////////////////////////////////////////////////////////////////////
52 /// Interface to Jitting a fetch shader
53 //////////////////////////////////////////////////////////////////////////
54 struct FetchJit : public Builder
55 {
56 FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
57
58 Function* Create(const FETCH_COMPILE_STATE& fetchState);
59 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
60 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
61 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
62
63 // package up Shuffle*bpcGatherd args into a tuple for convenience
64 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
65 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
66 const uint32_t (&)[4]> Shuffle8bpcArgs;
67 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
68
69 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
70 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
71 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
72
73 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
74
75 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
76
77 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut);
78 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut);
79
80 bool IsOddFormat(SWR_FORMAT format);
81 bool IsUniformFormat(SWR_FORMAT format);
82 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
83 void CreateGatherOddFormats(SWR_FORMAT format, Value* pBase, Value* offsets, Value* result[4]);
84 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
85
86 };
87
88 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
89 {
90 static std::size_t fetchNum = 0;
91
92 std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
93 fnName << fetchNum++;
94
95 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
96 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
97
98 IRB()->SetInsertPoint(entry);
99
100 auto argitr = fetch->getArgumentList().begin();
101
102 // Fetch shader arguments
103 Value* fetchInfo = &*argitr; ++argitr;
104 fetchInfo->setName("fetchInfo");
105 Value* pVtxOut = &*argitr;
106 pVtxOut->setName("vtxOutput");
107 // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex
108 // index 0 (just the pointer to the simdvertex structure)
109 // index 1 (which element of the simdvertex structure to offset to, in this case 0)
110 // so the indices being i32's doesn't matter
111 // TODO: generate this GEP with a VECTOR structure type so this makes sense
112 std::vector<Value*> vtxInputIndices(2, C(0));
113 // GEP
114 pVtxOut = GEP(pVtxOut, C(0));
115 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
116
117 // SWR_FETCH_CONTEXT::pStreams
118 Value* streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
119 streams->setName("pStreams");
120
121 // SWR_FETCH_CONTEXT::pIndices
122 Value* indices = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
123 indices->setName("pIndices");
124
125 // SWR_FETCH_CONTEXT::pLastIndex
126 Value* pLastIndex = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
127 pLastIndex->setName("pLastIndex");
128
129
130 Value* vIndices;
131 switch(fetchState.indexType)
132 {
133 case R8_UINT:
134 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
135 if(fetchState.bDisableIndexOOBCheck){
136 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
137 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
138 }
139 else{
140 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
141 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
142 }
143 break;
144 case R16_UINT:
145 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
146 if(fetchState.bDisableIndexOOBCheck){
147 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
148 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
149 }
150 else{
151 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
152 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
153 }
154 break;
155 case R32_UINT:
156 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
157 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
158 break; // incoming type is already 32bit int
159 default: SWR_ASSERT(0, "Unsupported index type"); vIndices = nullptr; break;
160 }
161
162 // store out vertex IDs
163 STORE(vIndices, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
164
165 // store out cut mask if enabled
166 if (fetchState.bEnableCutIndex)
167 {
168 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
169 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
170 STORE(cutMask, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
171 }
172
173 // Fetch attributes from memory and output to a simdvertex struct
174 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
175 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut)
176 : JitGatherVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut);
177
178 RET_VOID();
179
180 JitManager::DumpToFile(fetch, "src");
181
182 verifyFunction(*fetch);
183
184 #if HAVE_LLVM == 0x306
185 FunctionPassManager
186 #else
187 llvm::legacy::FunctionPassManager
188 #endif
189 setupPasses(JM()->mpCurrentModule);
190
191 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
192 setupPasses.add(createBreakCriticalEdgesPass());
193 setupPasses.add(createCFGSimplificationPass());
194 setupPasses.add(createEarlyCSEPass());
195 setupPasses.add(createPromoteMemoryToRegisterPass());
196
197 setupPasses.run(*fetch);
198
199 JitManager::DumpToFile(fetch, "se");
200
201 #if HAVE_LLVM == 0x306
202 FunctionPassManager
203 #else
204 llvm::legacy::FunctionPassManager
205 #endif
206 optPasses(JM()->mpCurrentModule);
207
208 ///@todo Haven't touched these either. Need to remove some of these and add others.
209 optPasses.add(createCFGSimplificationPass());
210 optPasses.add(createEarlyCSEPass());
211 optPasses.add(createInstructionCombiningPass());
212 optPasses.add(createInstructionSimplifierPass());
213 optPasses.add(createConstantPropagationPass());
214 optPasses.add(createSCCPPass());
215 optPasses.add(createAggressiveDCEPass());
216
217 optPasses.run(*fetch);
218 optPasses.run(*fetch);
219
220 JitManager::DumpToFile(fetch, "opt");
221
222 return fetch;
223 }
224
225 //////////////////////////////////////////////////////////////////////////
226 /// @brief Loads attributes from memory using LOADs, shuffling the
227 /// components into SOA form.
228 /// *Note* currently does not support component control,
229 /// component packing, or instancing
230 /// @param fetchState - info about attributes to be fetched from memory
231 /// @param streams - value pointer to the current vertex stream
232 /// @param vIndices - vector value of indices to load
233 /// @param pVtxOut - value pointer to output simdvertex struct
234 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut)
235 {
236 // Zack shuffles; a variant of the Charleston.
237
238 SWRL::UncheckedFixedVector<Value*, 16> vectors;
239
240 std::vector<Constant*> pMask(mVWidth);
241 for(uint32_t i = 0; i < mVWidth; ++i)
242 {
243 pMask[i] = (C(i < 4 ? i : 4));
244 }
245 Constant* promoteMask = ConstantVector::get(pMask);
246 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
247
248 Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
249
250 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
251 {
252 Value* elements[4] = {0};
253 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
254 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
255 uint32_t numComponents = info.numComps;
256 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
257
258 vectors.clear();
259
260 // load SWR_VERTEX_BUFFER_STATE::pData
261 Value *stream = LOAD(streams, {ied.StreamIndex, 2});
262
263 // load SWR_VERTEX_BUFFER_STATE::pitch
264 Value *stride = LOAD(streams, {ied.StreamIndex, 1});
265 stride = Z_EXT(stride, mInt64Ty);
266
267 // load SWR_VERTEX_BUFFER_STATE::size
268 Value *size = LOAD(streams, {ied.StreamIndex, 3});
269 size = Z_EXT(size, mInt64Ty);
270
271 Value* startVertexOffset = MUL(Z_EXT(startVertex, mInt64Ty), stride);
272
273 // Load from the stream.
274 for(uint32_t lane = 0; lane < mVWidth; ++lane)
275 {
276 // Get index
277 Value* index = VEXTRACT(vIndices, C(lane));
278 index = Z_EXT(index, mInt64Ty);
279
280 Value* offset = MUL(index, stride);
281 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
282 offset = ADD(offset, startVertexOffset);
283
284 if (!fetchState.bDisableIndexOOBCheck) {
285 // check for out of bound access, including partial OOB, and mask them to 0
286 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
287 Value *oob = ICMP_ULE(endOffset, size);
288 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
289 }
290
291 Value* pointer = GEP(stream, offset);
292 // We use a full-lane, but don't actually care.
293 Value* vptr = 0;
294
295 // get a pointer to a 4 component attrib in default address space
296 switch(bpc)
297 {
298 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
299 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
300 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
301 default: SWR_ASSERT(false, "Unsupported underlying bpp!");
302 }
303
304 // load 4 components of attribute
305 Value* vec = ALIGNED_LOAD(vptr, 1, false);
306
307 // Convert To FP32 internally
308 switch(info.type[0])
309 {
310 case SWR_TYPE_UNORM:
311 switch(bpc)
312 {
313 case 8:
314 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
315 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
316 break;
317 case 16:
318 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
319 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
320 break;
321 default:
322 SWR_ASSERT(false, "Unsupported underlying type!");
323 break;
324 }
325 break;
326 case SWR_TYPE_SNORM:
327 switch(bpc)
328 {
329 case 8:
330 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
331 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
332 break;
333 case 16:
334 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
335 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
336 break;
337 default:
338 SWR_ASSERT(false, "Unsupported underlying type!");
339 break;
340 }
341 break;
342 case SWR_TYPE_UINT:
343 // Zero extend uint32_t types.
344 switch(bpc)
345 {
346 case 8:
347 case 16:
348 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
349 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
350 break;
351 case 32:
352 break; // Pass through unchanged.
353 default:
354 SWR_ASSERT(false, "Unsupported underlying type!");
355 break;
356 }
357 break;
358 case SWR_TYPE_SINT:
359 // Sign extend SINT types.
360 switch(bpc)
361 {
362 case 8:
363 case 16:
364 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
365 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
366 break;
367 case 32:
368 break; // Pass through unchanged.
369 default:
370 SWR_ASSERT(false, "Unsupported underlying type!");
371 break;
372 }
373 break;
374 case SWR_TYPE_FLOAT:
375 switch(bpc)
376 {
377 case 32:
378 break; // Pass through unchanged.
379 default:
380 SWR_ASSERT(false, "Unsupported underlying type!");
381 }
382 break;
383 case SWR_TYPE_USCALED:
384 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
385 break;
386 case SWR_TYPE_SSCALED:
387 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
388 break;
389 case SWR_TYPE_UNKNOWN:
390 case SWR_TYPE_UNUSED:
391 SWR_ASSERT(false, "Unsupported type %d!", info.type[0]);
392 }
393
394 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
395 // uwvec: 4 x F32, undef value
396 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
397 vectors.push_back(wvec);
398 }
399
400 std::vector<Constant*> v01Mask(mVWidth);
401 std::vector<Constant*> v23Mask(mVWidth);
402 std::vector<Constant*> v02Mask(mVWidth);
403 std::vector<Constant*> v13Mask(mVWidth);
404
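// What follows is effectively an AOS-to-SOA transpose done with vector shuffles:
// each per-lane load above produced one vertex's xyzw in the low 4 lanes of a
// simd-width register; the insert shuffles place each vertex into its 4-lane slot,
// and the v01/v23/v02/v13 shuffles then split the data so that elements[0] holds
// all x components, elements[1] all y, elements[2] all z, and elements[3] all w.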
405 // Concatenate the vectors together.
406 elements[0] = VUNDEF_F();
407 elements[1] = VUNDEF_F();
408 elements[2] = VUNDEF_F();
409 elements[3] = VUNDEF_F();
410 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
411 {
412 v01Mask[4 * b + 0] = C(0 + 4 * b);
413 v01Mask[4 * b + 1] = C(1 + 4 * b);
414 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
415 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
416
417 v23Mask[4 * b + 0] = C(2 + 4 * b);
418 v23Mask[4 * b + 1] = C(3 + 4 * b);
419 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
420 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
421
422 v02Mask[4 * b + 0] = C(0 + 4 * b);
423 v02Mask[4 * b + 1] = C(2 + 4 * b);
424 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
425 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
426
427 v13Mask[4 * b + 0] = C(1 + 4 * b);
428 v13Mask[4 * b + 1] = C(3 + 4 * b);
429 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
430 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
431
432 std::vector<Constant*> iMask(mVWidth);
433 for(uint32_t i = 0; i < mVWidth; ++i)
434 {
435 if(((4 * b) <= i) && (i < (4 * (b + 1))))
436 {
437 iMask[i] = C(i % 4 + mVWidth);
438 }
439 else
440 {
441 iMask[i] = C(i);
442 }
443 }
444 Constant* insertMask = ConstantVector::get(iMask);
445 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
446 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
447 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
448 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
449 }
450
451 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
452 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
453 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
454 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
455 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
456 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
457 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
458 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
459
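// Fill in defaults (0, 0, 0, 1) for any components the format does not supply;
// the cases below intentionally fall through.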
460 switch(numComponents + 1)
461 {
462 case 1: elements[0] = VIMMED1(0.0f);
463 case 2: elements[1] = VIMMED1(0.0f);
464 case 3: elements[2] = VIMMED1(0.0f);
465 case 4: elements[3] = VIMMED1(1.0f);
466 }
467
468 for(uint32_t c = 0; c < 4; ++c)
469 {
470 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
471 STORE(elements[c], dest);
472 }
473 }
474 }
475
476 // returns true for odd formats that require special gather handling
477 bool FetchJit::IsOddFormat(SWR_FORMAT format)
478 {
479 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
480 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32)
481 {
482 return true;
483 }
484 return false;
485 }
486
487 // format is uniform if all components are the same size and type
488 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
489 {
490 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
491 uint32_t bpc0 = info.bpc[0];
492 uint32_t type0 = info.type[0];
493
494 for (uint32_t c = 1; c < info.numComps; ++c)
495 {
496 if (bpc0 != info.bpc[c] || type0 != info.type[c])
497 {
498 return false;
499 }
500 }
501 return true;
502 }
503
504 // unpacks components based on format
505 // foreach component in the pixel
506 // mask off everything but this component
507 // shift component to LSB
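// For example, with a packed 5:6:5-style layout (bpc = {5, 6, 5}), component 0 is
// masked with 0x0000001F and shifted by 0, component 1 with 0x000007E0 and shifted
// by 5, and component 2 with 0x0000F800 and shifted by 11, each landing in
// result[info.swizzle[c]].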
508 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
509 {
510 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
511
512 uint32_t bitOffset = 0;
513 for (uint32_t c = 0; c < info.numComps; ++c)
514 {
515 uint32_t swizzledIndex = info.swizzle[c];
516 uint32_t compBits = info.bpc[c];
517 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
518 Value* comp = AND(vInput, bitmask);
519 comp = LSHR(comp, bitOffset);
520
521 result[swizzledIndex] = comp;
522 bitOffset += compBits;
523 }
524 }
525
526 // gather for odd component size formats
527 // gather SIMD full pixels per lane then shift/mask to move each component to their
528 // own vector
529 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pBase, Value* offsets, Value* result[4])
530 {
531 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
532
533 // only works if pixel size is <= 32bits
534 SWR_ASSERT(info.bpp <= 32);
535
536 Value* gather = VUNDEF_I();
537
538 // assign defaults
539 for (uint32_t comp = 0; comp < 4; ++comp)
540 {
541 result[comp] = VIMMED1((int)info.defaults[comp]);
542 }
543
544 // gather SIMD pixels
545 for (uint32_t e = 0; e < JM()->mVWidth; ++e)
546 {
547 Value* elemOffset = VEXTRACT(offsets, C(e));
548 Value* load = GEP(pBase, elemOffset);
549
550 // load the proper amount of data based on component size
551 switch (info.bpp)
552 {
553 case 8: load = POINTER_CAST(load, Type::getInt8PtrTy(JM()->mContext)); break;
554 case 16: load = POINTER_CAST(load, Type::getInt16PtrTy(JM()->mContext)); break;
555 case 32: load = POINTER_CAST(load, Type::getInt32PtrTy(JM()->mContext)); break;
556 default: SWR_ASSERT(0);
557 }
558
559 // load pixel
560 Value *val = LOAD(load);
561
562 // zero extend to 32bit integer
563 val = INT_CAST(val, mInt32Ty, false);
564
565 // store in simd lane
566 gather = VINSERT(gather, val, C(e));
567 }
568
569 UnpackComponents(format, gather, result);
570
571 // cast to fp32
572 result[0] = BITCAST(result[0], mSimdFP32Ty);
573 result[1] = BITCAST(result[1], mSimdFP32Ty);
574 result[2] = BITCAST(result[2], mSimdFP32Ty);
575 result[3] = BITCAST(result[3], mSimdFP32Ty);
576 }
577
578 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
579 {
580 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
581
582 for (uint32_t c = 0; c < info.numComps; ++c)
583 {
584 uint32_t compIndex = info.swizzle[c];
585
586 // skip any conversion on UNUSED components
587 if (info.type[c] == SWR_TYPE_UNUSED)
588 {
589 continue;
590 }
591
592 if (info.isNormalized[c])
593 {
594 if (info.type[c] == SWR_TYPE_SNORM)
595 {
596 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
597
598 /// result = c * (1.0f / (2^(n-1) - 1))
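// e.g. for an 8-bit SNORM component n = 8, so scale = 1/127: a raw value of 127
// becomes 1.0f and -127 becomes -1.0f (-128 currently lands slightly below -1.0f,
// hence the todo above).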
599 uint32_t n = info.bpc[c];
600 uint32_t pow2 = 1 << (n - 1);
601 float scale = 1.0f / (float)(pow2 - 1);
602 Value *vScale = VIMMED1(scale);
603 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
604 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
605 texels[compIndex] = FMUL(texels[compIndex], vScale);
606 }
607 else
608 {
609 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
610
611 /// result = c * (1.0f / (2^n - 1))
612 uint32_t n = info.bpc[c];
613 uint32_t pow2 = 1 << n;
614 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
615 if (n == 24)
616 {
617 float scale = (float)(pow2 - 1);
618 Value* vScale = VIMMED1(scale);
619 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
620 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
621 texels[compIndex] = FDIV(texels[compIndex], vScale);
622 }
623 else
624 {
625 float scale = 1.0f / (float)(pow2 - 1);
626 Value *vScale = VIMMED1(scale);
627 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
628 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
629 texels[compIndex] = FMUL(texels[compIndex], vScale);
630 }
631 }
632 continue;
633 }
634 }
635 }
636
637 //////////////////////////////////////////////////////////////////////////
638 /// @brief Loads attributes from memory using AVX2 GATHER(s)
639 /// @param fetchState - info about attributes to be fetched from memory
640 /// @param fetchInfo - first argument passed to fetch shader
641 /// @param streams - value pointer to the current vertex stream
642 /// @param vIndices - vector value of indices to gather
643 /// @param pVtxOut - value pointer to output simdvertex struct
644 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo,
645 Value* streams, Value* vIndices, Value* pVtxOut)
646 {
647 uint32_t currentVertexElement = 0;
648 uint32_t outputElt = 0;
649 Value* vVertexElements[4];
650
651 Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
652 Value* startInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
653 Value* curInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
654 Value* vBaseVertex = VBROADCAST(LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
655 curInstance->setName("curInstance");
656
657 for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
658 {
659 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
660 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
661 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
662
663 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
664
665 // VGATHER* takes an *i8 src pointer
666 Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
667
668 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
669 Value *vStride = VBROADCAST(stride);
670
671 // max vertex index that is fully in bounds
672 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
673 maxVertex = LOAD(maxVertex);
674
675 Value *vCurIndices;
676 Value *startOffset;
677 if(ied.InstanceEnable)
678 {
679 Value* stepRate = C(ied.InstanceDataStepRate);
680
681 // prevent a div by 0 for 0 step rate
682 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
683 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
684
685 // calc the current offset into instanced data buffer
686 Value* calcInstance = UDIV(curInstance, stepRate);
687
688 // if step rate is 0, every instance gets instance 0
689 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
690
691 vCurIndices = VBROADCAST(calcInstance);
692
693 startOffset = startInstance;
694 }
695 else
696 {
697 // offset indices by baseVertex
698 vCurIndices = ADD(vIndices, vBaseVertex);
699
700 startOffset = startVertex;
701 }
702
703 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
704 // do 64bit address offset calculations.
705
706 // calculate byte offset to the start of the VB
707 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
708 pStreamBase = GEP(pStreamBase, baseOffset);
709
710 // if we have a start offset, subtract from max vertex. Used for OOB check
711 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
712 Value* neg = ICMP_SLT(maxVertex, C((int64_t)0));
713 // if we have a negative value, we're already OOB. clamp at 0.
714 maxVertex = SELECT(neg, C(0), TRUNC(maxVertex, mInt32Ty));
715
716 // Load the in bounds size of a partially valid vertex
717 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
718 partialInboundsSize = LOAD(partialInboundsSize);
719 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
720 Value* vBpp = VBROADCAST(C(info.Bpp));
721 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
722
723 // is the element <= the partially valid size?
724 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
725
726 // are vertices partially OOB?
727 Value* vMaxVertex = VBROADCAST(maxVertex);
728 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
729
730 // are the vertices fully in bounds?
731 Value* vGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
732
733 // blend in any partially OOB indices that have valid elements
734 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
735 vGatherMask = VMASK(vGatherMask);
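// At this point a lane takes part in the gather only if its index is strictly less
// than maxVertex, or equals maxVertex and the requested element still fits within
// partialInboundsSize; all other lanes are masked off and keep the gather's source
// value (zero) instead of reading out of bounds.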
736
737 // calculate the actual offsets into the VB
738 Value* vOffsets = MUL(vCurIndices, vStride);
739 vOffsets = ADD(vOffsets, vAlignmentOffsets);
740
741 // Packing and component control
742 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
743 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
744 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
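// compCtrl chooses, per output component, whether to store the fetched data (StoreSrc)
// or a constant produced by GenerateCompCtrlVector (0, 1.0f, 1, or undef for NoStore);
// compMask controls which components are actually read from the vertex buffer.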
745
746 // Special gather/conversion for formats without equal component sizes
747 if (IsOddFormat((SWR_FORMAT)ied.Format))
748 {
749 // Only full 4 component fetch is supported for odd formats
750 SWR_ASSERT(compMask == XYZW);
751 Value* pResults[4];
752 CreateGatherOddFormats((SWR_FORMAT)ied.Format, pStreamBase, vOffsets, pResults);
753 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
754
755 StoreVertexElements(pVtxOut, outputElt++, 4, pResults);
756 currentVertexElement = 0;
757 }
758 else if(info.type[0] == SWR_TYPE_FLOAT)
759 {
760 ///@todo: support 64 bit vb accesses
761 Value* gatherSrc = VIMMED1(0.0f);
762
763 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
764 "Unsupported format for standard gather fetch.");
765
766 // Gather components from memory to store in a simdvertex structure
767 switch(bpc)
768 {
769 case 16:
770 {
771 Value* vGatherResult[2];
772 Value *vMask;
773
774 // if we have at least one component out of x or y to fetch
775 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
776 // save mask as it is zero'd out after each gather
777 vMask = vGatherMask;
778
779 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
780 // e.g. result of first 8x32bit integer gather for 16bit components
781 // 256i - 0 1 2 3 4 5 6 7
782 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
783 //
784 }
785
786 // if we have at least one component out of z or w to fetch
787 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
788 // offset base to the next components(zw) in the vertex to gather
789 pStreamBase = GEP(pStreamBase, C((char)4));
790 vMask = vGatherMask;
791
792 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
793 // e.g. result of second 8x32bit integer gather for 16bit components
794 // 256i - 0 1 2 3 4 5 6 7
795 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
796 //
797 }
798
799 // if we have at least one component to shuffle into place
800 if(compMask){
801 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
802 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
803 // Shuffle gathered components into place in simdvertex struct
804 Shuffle16bpcGather(args); // outputs to vVertexElements ref
805 }
806 }
807 break;
808 case 32:
809 {
810 for(uint32_t i = 0; i < 4; i++)
811 {
812 if(!isComponentEnabled(compMask, i)){
813 // offset base to the next component in the vertex to gather
814 pStreamBase = GEP(pStreamBase, C((char)4));
815 continue;
816 }
817
818 // if we need to gather the component
819 if(compCtrl[i] == StoreSrc){
820 // save mask as it is zero'd out after each gather
821 Value *vMask = vGatherMask;
822
823 // Gather a SIMD of vertices
824 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
825 }
826 else{
827 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
828 }
829
830 if(currentVertexElement > 3){
831 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
832 // reset to the next vVertexElement to output
833 currentVertexElement = 0;
834 }
835
836 // offset base to the next component in the vertex to gather
837 pStreamBase = GEP(pStreamBase, C((char)4));
838 }
839 }
840 break;
841 default:
842 SWR_ASSERT(0, "Tried to fetch invalid FP format");
843 break;
844 }
845 }
846 else
847 {
848 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
849 ConversionType conversionType = CONVERT_NONE;
850
851 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
852 "Unsupported format for standard gather fetch.");
853
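// Note: UNORM intentionally falls through to UINT and SNORM to SINT below; they share
// the same integer extension and differ only in the normalization applied during shuffle.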
854 switch(info.type[0])
855 {
856 case SWR_TYPE_UNORM:
857 conversionType = CONVERT_NORMALIZED;
858 case SWR_TYPE_UINT:
859 extendCastType = Instruction::CastOps::ZExt;
860 break;
861 case SWR_TYPE_SNORM:
862 conversionType = CONVERT_NORMALIZED;
863 case SWR_TYPE_SINT:
864 extendCastType = Instruction::CastOps::SExt;
865 break;
866 case SWR_TYPE_USCALED:
867 conversionType = CONVERT_USCALED;
868 extendCastType = Instruction::CastOps::UIToFP;
869 break;
870 case SWR_TYPE_SSCALED:
871 conversionType = CONVERT_SSCALED;
872 extendCastType = Instruction::CastOps::SIToFP;
873 break;
874 default:
875 break;
876 }
877
878 // value substituted when component of gather is masked
879 Value* gatherSrc = VIMMED1(0);
880
881 // Gather components from memory to store in a simdvertex structure
882 switch (bpc)
883 {
884 case 8:
885 {
886 // if we have at least one component to fetch
887 if(compMask){
888 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
889 // e.g. result of an 8x32bit integer gather for 8bit components
890 // 256i - 0 1 2 3 4 5 6 7
891 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
892
893 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
894 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
895 // Shuffle gathered components into place in simdvertex struct
896 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
897 }
898 }
899 break;
900 case 16:
901 {
902 Value* vGatherResult[2];
903 Value *vMask;
904
905 // if we have at least one component out of x or y to fetch
906 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
907 // save mask as it is zero'd out after each gather
908 vMask = vGatherMask;
909
910 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
911 // e.g. result of first 8x32bit integer gather for 16bit components
912 // 256i - 0 1 2 3 4 5 6 7
913 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
914 //
915 }
916
917 // if we have at least one component out of z or w to fetch
918 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
919 // offset base to the next components(zw) in the vertex to gather
920 pStreamBase = GEP(pStreamBase, C((char)4));
921 vMask = vGatherMask;
922
923 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
924 // e.g. result of second 8x32bit integer gather for 16bit components
925 // 256i - 0 1 2 3 4 5 6 7
926 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
927 //
928 }
929
930 // if we have at least one component to shuffle into place
931 if(compMask){
932 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
933 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
934 // Shuffle gathered components into place in simdvertex struct
935 Shuffle16bpcGather(args); // outputs to vVertexElements ref
936 }
937 }
938 break;
939 case 32:
940 {
941 SWR_ASSERT(conversionType == CONVERT_NONE);
942
943 // Gathered components into place in simdvertex struct
944 for(uint32_t i = 0; i < 4; i++)
945 {
946 if(!isComponentEnabled(compMask, i)){
947 // offset base to the next component in the vertex to gather
948 pStreamBase = GEP(pStreamBase, C((char)4));
949 continue;
950 }
951
952 // if we need to gather the component
953 if(compCtrl[i] == StoreSrc){
954 // save mask as it is zero'd out after each gather
955 Value *vMask = vGatherMask;
956
957 vVertexElements[currentVertexElement++] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
958
959 // e.g. result of a single 8x32bit integer gather for 32bit components
960 // 256i - 0 1 2 3 4 5 6 7
961 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
962 }
963 else{
964 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
965 }
966
967 if(currentVertexElement > 3){
968 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
969 // reset to the next vVertexElement to output
970 currentVertexElement = 0;
971 }
972
973 // offset base to the next component in the vertex to gather
974 pStreamBase = GEP(pStreamBase, C((char)4));
975 }
976 }
977 break;
978 }
979 }
980 }
981
982 // if we have a partially filled vVertexElement struct, output it
983 if(currentVertexElement > 0){
984 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
985 }
986 }
987
988 //////////////////////////////////////////////////////////////////////////
989 /// @brief Loads a simd of valid indices. OOB indices are set to 0
990 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
991 /// support
992 /// @param pIndices - pointer to 8 bit indices
993 /// @param pLastIndex - pointer to last valid index
994 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
995 {
996 // can fit 4 8 bit integers per vWidth lane
997 Value* vIndices = VUNDEF_I();
998
999 // store 0 index on stack to be used to conditionally load from if index address is OOB
1000 Value* pZeroIndex = ALLOCA(mInt8Ty);
1001 STORE(C((uint8_t)0), pZeroIndex);
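// Rather than branching per lane, an OOB lane simply has its load address redirected
// to this stack slot, so it reads a harmless 0 index instead of memory past pLastIndex.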
1002
1003 // Load a SIMD of index pointers
1004 for(int64_t lane = 0; lane < mVWidth; lane++)
1005 {
1006 // Calculate the address of the requested index
1007 Value *pIndex = GEP(pIndices, C(lane));
1008
1009 // check if the address is less than the max index,
1010 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1011
1012 // if valid, load the index. if not, load 0 from the stack
1013 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1014 Value *index = LOAD(pValid, "valid index");
1015
1016 // zero extended index to 32 bits and insert into the correct simd lane
1017 index = Z_EXT(index, mInt32Ty);
1018 vIndices = VINSERT(vIndices, index, lane);
1019 }
1020 return vIndices;
1021 }
1022
1023 //////////////////////////////////////////////////////////////////////////
1024 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1025 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1026 /// support
1027 /// @param pIndices - pointer to 16 bit indices
1028 /// @param pLastIndex - pointer to last valid index
1029 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1030 {
1031 // can fit 2 16 bit integers per vWidth lane
1032 Value* vIndices = VUNDEF_I();
1033
1034 // store 0 index on stack to be used to conditionally load from if index address is OOB
1035 Value* pZeroIndex = ALLOCA(mInt16Ty);
1036 STORE(C((uint16_t)0), pZeroIndex);
1037
1038 // Load a SIMD of index pointers
1039 for(int64_t lane = 0; lane < mVWidth; lane++)
1040 {
1041 // Calculate the address of the requested index
1042 Value *pIndex = GEP(pIndices, C(lane));
1043
1044 // check if the address is less than the max index,
1045 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1046
1047 // if valid, load the index. if not, load 0 from the stack
1048 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1049 Value *index = LOAD(pValid, "valid index");
1050
1051 // zero extended index to 32 bits and insert into the correct simd lane
1052 index = Z_EXT(index, mInt32Ty);
1053 vIndices = VINSERT(vIndices, index, lane);
1054 }
1055 return vIndices;
1056 }
1057
1058 //////////////////////////////////////////////////////////////////////////
1059 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1060 /// @param pIndices - pointer to 32 bit indices
1061 /// @param pLastIndex - pointer to last valid index
1062 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1063 {
1064 DataLayout dL(JM()->mpCurrentModule);
1065 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1066 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1067 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1068
1069 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1070 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1071 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1072 numIndicesLeft = SDIV(numIndicesLeft, C(4));
1073
1074 // create a vector of index counts from the base index ptr passed into the fetch
1075 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1076 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
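// Note: this {0..7} offset vector (together with the SDIV by 4, i.e. sizeof(uint32_t))
// assumes an 8-wide SIMD; a different mVWidth would need a different constant here.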
1077
1078 // compare index count to the max valid index
1079 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1080 // vIndexOffsets 0 1 2 3 4 5 6 7
1081 // ------------------------------
1082 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1083 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1084 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1085 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1086
1087 // VMASKLOAD takes an *i8 src pointer
1088 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1089
1090 // Load the indices; OOB loads 0
1091 return MASKLOADD(pIndices,vIndexMask);
1092 }
1093
1094 //////////////////////////////////////////////////////////////////////////
1095 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1096 /// denormalizes if needed, converts to F32 if needed, and positions in
1097 /// the proper SIMD rows to be output to the simdvertex structure
1098 /// @param args: (tuple of args, listed below)
1099 /// @param vGatherResult - 8 gathered 8bpc vertices
1100 /// @param pVtxOut - base pointer to output simdvertex struct
1101 /// @param extendType - sign extend or zero extend
1102 /// @param conversionType - conversion to apply (none, normalized, or scaled)
1103 /// @param currentVertexElement - reference to the current vVertexElement
1104 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1105 /// @param compMask - component packing mask
1106 /// @param compCtrl - component control val
1107 /// @param vVertexElements[4] - vertex components to output
1108 /// @param swizzle[4] - component swizzle location
1109 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1110 {
1111 // Unpack tuple args
1112 Value*& vGatherResult = std::get<0>(args);
1113 Value* pVtxOut = std::get<1>(args);
1114 const Instruction::CastOps extendType = std::get<2>(args);
1115 const ConversionType conversionType = std::get<3>(args);
1116 uint32_t &currentVertexElement = std::get<4>(args);
1117 uint32_t &outputElt = std::get<5>(args);
1118 const ComponentEnable compMask = std::get<6>(args);
1119 const ComponentControl (&compCtrl)[4] = std::get<7>(args);
1120 Value* (&vVertexElements)[4] = std::get<8>(args);
1121 const uint32_t (&swizzle)[4] = std::get<9>(args);
1122
1123 // cast types
1124 Type* vGatherTy = mSimdInt32Ty;
1125 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1126
1127 // have to do extra work for sign extending
1128 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
1129 Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
1130 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
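// Overall flow for the signed path: PSHUFB groups the bytes of each component together
// within each 128-bit half, PERMD then packs a component's 8 bytes into a single
// 128-bit lane, and PMOVSXBD sign-extends those bytes to 8 x 32-bit values that form
// one output row.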
1131
1132 // shuffle mask, including any swizzling
1133 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1134 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1135 Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
1136 char(y), char(y+4), char(y+8), char(y+12),
1137 char(z), char(z+4), char(z+8), char(z+12),
1138 char(w), char(w+4), char(w+8), char(w+12),
1139 char(x), char(x+4), char(x+8), char(x+12),
1140 char(y), char(y+4), char(y+8), char(y+12),
1141 char(z), char(z+4), char(z+8), char(z+12),
1142 char(w), char(w+4), char(w+8), char(w+12)});
1143
1144 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1145 // after pshufb: group components together in each 128bit lane
1146 // 256i - 0 1 2 3 4 5 6 7
1147 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1148
1149 Value* vi128XY = nullptr;
1150 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1151 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1152 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1153 // 256i - 0 1 2 3 4 5 6 7
1154 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1155 }
1156
1157 // do the same for zw components
1158 Value* vi128ZW = nullptr;
1159 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1160 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1161 }
1162
1163 // init denormalize variables if needed
1164 Instruction::CastOps fpCast;
1165 Value* conversionFactor;
1166
1167 switch (conversionType)
1168 {
1169 case CONVERT_NORMALIZED:
1170 fpCast = Instruction::CastOps::SIToFP;
1171 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1172 break;
1173 case CONVERT_SSCALED:
1174 fpCast = Instruction::CastOps::SIToFP;
1175 conversionFactor = VIMMED1((float)(1.0));
1176 break;
1177 case CONVERT_USCALED:
1178 SWR_ASSERT(0, "Type should not be sign extended!");
1179 conversionFactor = nullptr;
1180 break;
1181 default:
1182 SWR_ASSERT(conversionType == CONVERT_NONE);
1183 conversionFactor = nullptr;
1184 break;
1185 }
1186
1187 // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
1188 for(uint32_t i = 0; i < 4; i++){
1189 if(!isComponentEnabled(compMask, i)){
1190 continue;
1191 }
1192
1193 if(compCtrl[i] == ComponentControl::StoreSrc){
1194 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1195 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1196 // if x or y, use vi128XY permute result, else use vi128ZW
1197 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1198
1199 // sign extend
1200 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
1201
1202 // denormalize if needed
1203 if(conversionType != CONVERT_NONE){
1204 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1205 }
1206 currentVertexElement++;
1207 }
1208 else{
1209 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1210 }
1211
1212 if(currentVertexElement > 3){
1213 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1214 // reset to the next vVertexElement to output
1215 currentVertexElement = 0;
1216 }
1217 }
1218 }
1219 // else zero extend
1220 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1221 {
1222 // init denormalize variables if needed
1223 Instruction::CastOps fpCast;
1224 Value* conversionFactor;
1225
1226 switch (conversionType)
1227 {
1228 case CONVERT_NORMALIZED:
1229 fpCast = Instruction::CastOps::UIToFP;
1230 conversionFactor = VIMMED1((float)(1.0 / 255.0));
1231 break;
1232 case CONVERT_USCALED:
1233 fpCast = Instruction::CastOps::UIToFP;
1234 conversionFactor = VIMMED1((float)(1.0));
1235 break;
1236 case CONVERT_SSCALED:
1237 SWR_ASSERT(0, "Type should not be zero extended!");
1238 conversionFactor = nullptr;
1239 break;
1240 default:
1241 SWR_ASSERT(conversionType == CONVERT_NONE);
1242 conversionFactor = nullptr;
1243 break;
1244 }
1245
1246 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1247 for(uint32_t i = 0; i < 4; i++){
1248 if(!isComponentEnabled(compMask, i)){
1249 continue;
1250 }
1251
1252 if(compCtrl[i] == ComponentControl::StoreSrc){
1253 // pshufb masks for each component
1254 Value* vConstMask;
1255 switch(swizzle[i]){
1256 case 0:
1257 // x shuffle mask
1258 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1259 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1260 break;
1261 case 1:
1262 // y shuffle mask
1263 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1264 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1265 break;
1266 case 2:
1267 // z shuffle mask
1268 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1269 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1270 break;
1271 case 3:
1272 // w shuffle mask
1273 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1274 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1275 break;
1276 default:
1277 vConstMask = nullptr;
1278 break;
1279 }
1280
1281 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1282 // after pshufb for x channel
1283 // 256i - 0 1 2 3 4 5 6 7
1284 // x000 x000 x000 x000 x000 x000 x000 x000
1285
1286 // denormalize if needed
1287 if (conversionType != CONVERT_NONE){
1288 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1289 }
1290 currentVertexElement++;
1291 }
1292 else{
1293 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1294 }
1295
1296 if(currentVertexElement > 3){
1297 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1298 // reset to the next vVertexElement to output
1299 currentVertexElement = 0;
1300 }
1301 }
1302 }
1303 else
1304 {
1305 SWR_ASSERT(0, "Unsupported conversion type");
1306 }
1307 }
1308
1309 //////////////////////////////////////////////////////////////////////////
1310 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1311 /// denormalizes if needed, converts to F32 if needed, and positions in
1312 /// the proper SIMD rows to be output to the simdvertex structure
1313 /// @param args: (tuple of args, listed below)
1314 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1315 /// @param pVtxOut - base pointer to output simdvertex struct
1316 /// @param extendType - sign extend or zero extend
1317 /// @param conversionType - conversion to apply (none, normalized, or scaled)
1318 /// @param currentVertexElement - reference to the current vVertexElement
1319 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1320 /// @param compMask - component packing mask
1321 /// @param compCtrl - component control val
1322 /// @param vVertexElements[4] - vertex components to output
1323 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
1324 {
1325 // Unpack tuple args
1326 Value* (&vGatherResult)[2] = std::get<0>(args);
1327 Value* pVtxOut = std::get<1>(args);
1328 const Instruction::CastOps extendType = std::get<2>(args);
1329 const ConversionType conversionType = std::get<3>(args);
1330 uint32_t &currentVertexElement = std::get<4>(args);
1331 uint32_t &outputElt = std::get<5>(args);
1332 const ComponentEnable compMask = std::get<6>(args);
1333 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
1334 Value* (&vVertexElements)[4] = std::get<8>(args);
1335
1336 // cast types
1337 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1338 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1339
1340 // have to do extra work for sign extending
1341 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
1342 (extendType == Instruction::CastOps::FPExt))
1343 {
1344 // is this a 16-bit (half precision) float?
1345 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1346
1347 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1348 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
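// Same idea as the 8bpc path: PSHUFB groups the 16-bit components within each 128-bit
// half, PERMD packs each component's eight 16-bit values into one 128-bit lane, and each
// extracted lane is then widened with PMOVSXWD (or converted with CVTPH2PS for half floats).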
1349
1350 // shuffle mask
1351 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1352 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1353 Value* vi128XY = nullptr;
1354 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1355 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1356 // after pshufb: group components together in each 128bit lane
1357 // 256i - 0 1 2 3 4 5 6 7
1358 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1359
1360 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1361 // after PERMD: move and pack xy components into each 128bit lane
1362 // 256i - 0 1 2 3 4 5 6 7
1363 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1364 }
1365
1366 // do the same for zw components
1367 Value* vi128ZW = nullptr;
1368 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1369 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1370 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1371 }
1372
1373 // init denormalize variables if needed
1374 Instruction::CastOps IntToFpCast;
1375 Value* conversionFactor;
1376
1377 switch (conversionType)
1378 {
1379 case CONVERT_NORMALIZED:
1380 IntToFpCast = Instruction::CastOps::SIToFP;
1381 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1382 break;
1383 case CONVERT_SSCALED:
1384 IntToFpCast = Instruction::CastOps::SIToFP;
1385 conversionFactor = VIMMED1((float)(1.0));
1386 break;
1387 case CONVERT_USCALED:
1388 SWR_ASSERT(0, "Type should not be sign extended!");
1389 conversionFactor = nullptr;
1390 break;
1391 default:
1392 SWR_ASSERT(conversionType == CONVERT_NONE);
1393 conversionFactor = nullptr;
1394 break;
1395 }
1396
1397 // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
1398 for(uint32_t i = 0; i < 4; i++){
1399 if(!isComponentEnabled(compMask, i)){
1400 continue;
1401 }
1402
1403 if(compCtrl[i] == ComponentControl::StoreSrc){
1404 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1405 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1406 // if x or y, use vi128XY permute result, else use vi128ZW
1407 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1408
1409 if(bFP) {
1410 // extract 128 bit lanes and convert each half-float component to 32-bit float
1411 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1412 }
1413 else {
1414 // extract 128 bit lanes to sign extend each component
1415 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1416
1417 // denormalize if needed
1418 if(conversionType != CONVERT_NONE){
1419 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1420 }
1421 }
1422 currentVertexElement++;
1423 }
1424 else{
1425 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1426 }
1427
1428 if(currentVertexElement > 3){
1429 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1430 // reset to the next vVertexElement to output
1431 currentVertexElement = 0;
1432 }
1433 }
1434
1435 }
1436 // else zero extend
1437 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
1438 {
1439 // pshufb masks for each component
1440 Value* vConstMask[2];
1441 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
1442 // x/z shuffle mask
1443 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1444 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1445 }
1446
1447 if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
1448 // y/w shuffle mask
1449 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1450 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1451 }
1452
1453 // init denormalize variables if needed
1454 Instruction::CastOps fpCast;
1455 Value* conversionFactor;
1456
1457 switch (conversionType)
1458 {
1459 case CONVERT_NORMALIZED:
1460 fpCast = Instruction::CastOps::UIToFP;
1461 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1462 break;
1463 case CONVERT_USCALED:
1464 fpCast = Instruction::CastOps::UIToFP;
1465 conversionFactor = VIMMED1((float)(1.0f));
1466 break;
1467 case CONVERT_SSCALED:
1468 SWR_ASSERT(0, "Type should not be zero extended!");
1469 conversionFactor = nullptr;
1470 break;
1471 default:
1472 SWR_ASSERT(conversionType == CONVERT_NONE);
1473 conversionFactor = nullptr;
1474 break;
1475 }
1476
1477 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1478 for(uint32_t i = 0; i < 4; i++){
1479 if(!isComponentEnabled(compMask, i)){
1480 continue;
1481 }
1482
1483 if(compCtrl[i] == ComponentControl::StoreSrc){
1484 // select correct constMask for x/z or y/w pshufb
1485 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1486 // if x or y, use vi128XY permute result, else use vi128ZW
1487 uint32_t selectedGather = (i < 2) ? 0 : 1;
1488
1489 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1490 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1491 // 256i - 0 1 2 3 4 5 6 7
1492 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1493
1494 // denormalize if needed
1495 if(conversionType != CONVERT_NONE){
1496 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1497 }
1498 currentVertexElement++;
1499 }
1500 else{
1501 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1502 }
1503
1504 if(currentVertexElement > 3){
1505 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1506 // reset to the next vVertexElement to output
1507 currentVertexElement = 0;
1508 }
1509 }
1510 }
1511 else
1512 {
1513 SWR_ASSERT(0, "Unsupported conversion type");
1514 }
1515 }
1516
1517 //////////////////////////////////////////////////////////////////////////
1518 /// @brief Output a simdvertex worth of elements to the current outputElt
1519 /// @param pVtxOut - base address of VIN output struct
1520 /// @param outputElt - simdvertex offset in VIN to write to
1521 /// @param numEltsToStore - number of simdvertex rows to write out
1522 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
1523 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
1524 {
1525 for(uint32_t c = 0; c < numEltsToStore; ++c)
1526 {
1527 // STORE expects FP32 x vWidth type, just bitcast if needed
1528 if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){
1529 #if FETCH_DUMP_VERTEX
1530 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
1531 #endif
1532 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
1533 }
1534 #if FETCH_DUMP_VERTEX
1535 else
1536 {
1537 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
1538 }
1539 #endif
1540 // outputElt * 4 = offsetting by the size of a simdvertex
1541 // + c offsets to a 32bit x vWidth row within the current vertex
1542 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
1543 STORE(vVertexElements[c], dest);
1544 }
1545 }
1546
1547 //////////////////////////////////////////////////////////////////////////
1548 /// @brief Generates a constant vector of values based on the
1549 /// ComponentControl value
1550 /// @param ctrl - ComponentControl value
1551 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
1552 {
1553 switch(ctrl)
1554 {
1555 case NoStore: return VUNDEF_I();
1556 case Store0: return VIMMED1(0);
1557 case Store1Fp: return VIMMED1(1.0f);
1558 case Store1Int: return VIMMED1(1);
1559 case StoreSrc:
1560 default: SWR_ASSERT(0, "Invalid component control"); return VUNDEF_I();
1561 }
1562 }
1563
1564 //////////////////////////////////////////////////////////////////////////
1565 /// @brief Returns the enable mask for the specified component.
1566 /// @param enableMask - enable bits
1567 /// @param component - component to check if enabled.
1568 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
1569 {
1570 switch (component)
1571 {
1572 // X
1573 case 0: return (enableMask & ComponentEnable::X);
1574 // Y
1575 case 1: return (enableMask & ComponentEnable::Y);
1576 // Z
1577 case 2: return (enableMask & ComponentEnable::Z);
1578 // W
1579 case 3: return (enableMask & ComponentEnable::W);
1580
1581 default: return false;
1582 }
1583 }
1584
1585
1586 //////////////////////////////////////////////////////////////////////////
1587 /// @brief JITs from fetch shader IR
1588 /// @param hJitMgr - JitManager handle
1589 /// @param func - LLVM function IR
1590 /// @return PFN_FETCH_FUNC - pointer to fetch code
1591 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
1592 {
1593 const llvm::Function* func = (const llvm::Function*)hFunc;
1594 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1595 PFN_FETCH_FUNC pfnFetch;
1596
1597 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
1598 // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
1599 pJitMgr->mIsModuleFinalized = true;
1600
1601 #if defined(KNOB_SWRC_TRACING)
1602 char fName[1024];
1603 const char *funcName = func->getName().data();
1604 sprintf(fName, "%s.bin", funcName);
1605 FILE *fd = fopen(fName, "wb");
1606 fwrite((void *)pfnFetch, 1, 2048, fd);
1607 fclose(fd);
1608 #endif
1609
1610 return pfnFetch;
1611 }
1612
1613 //////////////////////////////////////////////////////////////////////////
1614 /// @brief JIT compiles fetch shader
1615 /// @param hJitMgr - JitManager handle
1616 /// @param state - fetch state to build function from
1617 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
1618 {
1619 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
1620
1621 pJitMgr->SetupNewModule();
1622
1623 FetchJit theJit(pJitMgr);
1624 HANDLE hFunc = theJit.Create(state);
1625
1626 return JitFetchFunc(hJitMgr, hFunc);
1627 }