1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "jit_api.h"
32 #include "fetch_jit.h"
33 #include "gen_state_llvm.h"
34 #include <sstream>
35 #include <tuple>
36
37 //#define FETCH_DUMP_VERTEX 1
38 using namespace llvm;
39 using namespace SwrJit;
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45 CONVERT_NONE,
46 CONVERT_NORMALIZED,
47 CONVERT_USCALED,
48 CONVERT_SSCALED,
49 CONVERT_SFIXED,
50 };
51
52 //////////////////////////////////////////////////////////////////////////
53 /// Interface to Jitting a fetch shader
54 //////////////////////////////////////////////////////////////////////////
55 struct FetchJit : public Builder
56 {
57 FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
58
59 Function* Create(const FETCH_COMPILE_STATE& fetchState);
60 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
61 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
62 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
63
64 // package up Shuffle*bpcGatherd args into a tuple for convenience
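// the tuple members, in the order they are packed by the forward_as_tuple call sites below:
// gathered result(s), destination pointer, extend cast op, conversion type,
// current vertex element (in/out), output element (in/out), component enable mask,
// component controls, output vertex elements, and (8bpc only) the format swizzle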
65 typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
66 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
67 const uint32_t(&)[4]> Shuffle8bpcArgs;
68 #if USE_SIMD16_SHADERS
69 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2);
70 #else
71 void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
72 #endif
73
74 typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
75 uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
76 #if USE_SIMD16_SHADERS
77 void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2);
78 #else
79 void Shuffle16bpcGather(Shuffle16bpcArgs &args);
80 #endif
81
82 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
83 #if USE_SIMD16_BUILDER
84 void StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
85 #endif
86
87 #if USE_SIMD16_SHADERS
88 Value* GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2);
89 #else
90 Value* GenerateCompCtrlVector(const ComponentControl ctrl);
91 #endif
92
93 void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
94 #if USE_SIMD16_SHADERS
95 #define USE_SIMD16_GATHERS 0
96
97 #if USE_SIMD16_GATHERS
98 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2);
99 #else
100 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2);
101 #endif
102 #else
103 void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
104 #endif
105
106 bool IsOddFormat(SWR_FORMAT format);
107 bool IsUniformFormat(SWR_FORMAT format);
108 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
109 void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
110 void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
111
112 Value* mpFetchInfo;
113 };
114
115 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
116 {
117 std::stringstream fnName("FetchShader_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
118 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
119
120 Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
121 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
122
123 fetch->getParent()->setModuleIdentifier(fetch->getName());
124
125 IRB()->SetInsertPoint(entry);
126
127 auto argitr = fetch->arg_begin();
128
129 // Fetch shader arguments
130 mpFetchInfo = &*argitr; ++argitr;
131 mpFetchInfo->setName("fetchInfo");
132 Value* pVtxOut = &*argitr;
133 pVtxOut->setName("vtxOutput");
134 // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex:
135 // index 0 (just the pointer to the simdvertex structure),
136 // index 1 (which element of the simdvertex structure to offset to, in this case 0),
137 // so the indices being i32's doesn't matter
138 // TODO: generate this GEP with a VECTOR structure type so this makes sense
139 std::vector<Value*> vtxInputIndices(2, C(0));
140 // GEP
141 pVtxOut = GEP(pVtxOut, C(0));
142 #if USE_SIMD16_SHADERS
143 #if 0// USE_SIMD16_BUILDER
144 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
145 #else
146 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
147 #endif
148 #else
149 pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
150 #endif
151
152 // SWR_FETCH_CONTEXT::pStreams
153 Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
154 streams->setName("pStreams");
155
156 // SWR_FETCH_CONTEXT::pIndices
157 Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
158 indices->setName("pIndices");
159
160 // SWR_FETCH_CONTEXT::pLastIndex
161 Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
162 pLastIndex->setName("pLastIndex");
163
164
165 Value* vIndices;
166 #if USE_SIMD16_SHADERS
167 Value* indices2;
168 Value* vIndices2;
169 #endif
170 switch(fetchState.indexType)
171 {
172 case R8_UINT:
173 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
174 #if USE_SIMD16_SHADERS
175 indices2 = GEP(indices, C(8));
176 #endif
177 if(fetchState.bDisableIndexOOBCheck)
178 {
179 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
180 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
181 #if USE_SIMD16_SHADERS
182 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
183 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
184 #endif
185 }
186 else
187 {
188 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
189 vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
190 #if USE_SIMD16_SHADERS
191 pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
192 vIndices2 = GetSimdValid8bitIndices(indices2, pLastIndex);
193 #endif
194 }
195 break;
196 case R16_UINT:
197 indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
198 #if USE_SIMD16_SHADERS
199 indices2 = GEP(indices, C(8));
200 #endif
201 if(fetchState.bDisableIndexOOBCheck)
202 {
203 vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
204 vIndices = Z_EXT(vIndices, mSimdInt32Ty);
205 #if USE_SIMD16_SHADERS
206 vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
207 vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
208 #endif
209 }
210 else
211 {
212 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
213 vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
214 #if USE_SIMD16_SHADERS
215 pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
216 vIndices2 = GetSimdValid16bitIndices(indices2, pLastIndex);
217 #endif
218 }
219 break;
220 case R32_UINT:
221 #if USE_SIMD16_SHADERS
222 indices2 = GEP(indices, C(8));
223 #endif
224 (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
225 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
226 #if USE_SIMD16_SHADERS
227 (fetchState.bDisableIndexOOBCheck) ? vIndices2 = LOAD(BITCAST(indices2, PointerType::get(mSimdInt32Ty, 0)), { (uint32_t)0 })
228 : vIndices2 = GetSimdValid32bitIndices(indices2, pLastIndex);
229 #endif
230 break; // incoming type is already 32bit int
231 default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break;
232 }
233
234 if(fetchState.bForceSequentialAccessEnable)
235 {
236 Value* pOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
237
238 // VertexData buffers are accessed sequentially; the index is equal to the vertex number
239 vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
240 vIndices = ADD(vIndices, pOffsets);
241 #if USE_SIMD16_SHADERS
242 vIndices2 = ADD(vIndices, VIMMED1(8));
243 #endif
244 }
245
246 Value* vVertexId = vIndices;
247 #if USE_SIMD16_SHADERS
248 Value* vVertexId2 = vIndices2;
249 #endif
250 if (fetchState.bVertexIDOffsetEnable)
251 {
252 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
253 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
254 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
255 vVertexId = ADD(vIndices, vBaseVertex);
256 vVertexId = ADD(vVertexId, vStartVertex);
257 #if USE_SIMD16_SHADERS
258 vVertexId2 = ADD(vIndices2, vBaseVertex);
259 vVertexId2 = ADD(vVertexId2, vStartVertex);
260 #endif
261 }
262
263 // store out vertex IDs
264 STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
265 #if USE_SIMD16_SHADERS
266 STORE(vVertexId2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
267 #endif
268
269 // store out cut mask if enabled
270 if (fetchState.bEnableCutIndex)
271 {
272 Value* vCutIndex = VIMMED1(fetchState.cutIndex);
273 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
274 STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
275 #if USE_SIMD16_SHADERS
276 Value* cutMask2 = VMASK(ICMP_EQ(vIndices2, vCutIndex));
277 STORE(cutMask2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
278 #endif
279 }
280
281 // Fetch attributes from memory and output to a simdvertex struct
282 // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
283 #if USE_SIMD16_SHADERS
284 if (fetchState.bDisableVGATHER)
285 {
286 JitLoadVertices(fetchState, streams, vIndices, pVtxOut);
287 JitLoadVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)));
288 }
289 else
290 {
291 #if USE_SIMD16_GATHERS
292 JitGatherVertices(fetchState, streams, vIndices, vIndices2, pVtxOut, false);
293 #else
294 JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false);
295 JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true);
296 #endif
297 }
298 #else
299 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
300 : JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
301 #endif
302
303 RET_VOID();
304
305 JitManager::DumpToFile(fetch, "src");
306
307 #if defined(_DEBUG)
308 verifyFunction(*fetch);
309 #endif
310
311 ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
312
313 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
314 setupPasses.add(createBreakCriticalEdgesPass());
315 setupPasses.add(createCFGSimplificationPass());
316 setupPasses.add(createEarlyCSEPass());
317 setupPasses.add(createPromoteMemoryToRegisterPass());
318
319 setupPasses.run(*fetch);
320
321 JitManager::DumpToFile(fetch, "se");
322
323 ::FunctionPassManager optPasses(JM()->mpCurrentModule);
324
325 ///@todo Haven't touched these either. Need to remove some of these and add others.
326 optPasses.add(createCFGSimplificationPass());
327 optPasses.add(createEarlyCSEPass());
328 optPasses.add(createInstructionCombiningPass());
329 optPasses.add(createInstructionSimplifierPass());
330 optPasses.add(createConstantPropagationPass());
331 optPasses.add(createSCCPPass());
332 optPasses.add(createAggressiveDCEPass());
333
334 optPasses.run(*fetch);
335 optPasses.run(*fetch);
336
337 JitManager::DumpToFile(fetch, "opt");
338
339 return fetch;
340 }
341
342 //////////////////////////////////////////////////////////////////////////
343 /// @brief Loads attributes from memory using LOADs, shuffling the
344 /// components into SOA form.
345 /// *Note* currently does not support component control,
346 /// component packing, instancing
347 /// @param fetchState - info about attributes to be fetched from memory
348 /// @param streams - value pointer to the current vertex stream
349 /// @param vIndices - vector value of indices to load
350 /// @param pVtxOut - value pointer to output simdvertex struct
351 void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut)
352 {
353 // Zack shuffles; a variant of the Charleston.
354
355 std::vector<Value*> vectors(16);
356 std::vector<Constant*> pMask(mVWidth);
357 for(uint32_t i = 0; i < mVWidth; ++i)
358 {
359 pMask[i] = (C(i < 4 ? i : 4));
360 }
361 Constant* promoteMask = ConstantVector::get(pMask);
362 Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4));
363
364 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
365 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
366 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
367 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
368 curInstance->setName("curInstance");
369
370 for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt)
371 {
372 Value* elements[4] = {0};
373 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt];
374 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
375 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices.");
376 uint32_t numComponents = info.numComps;
377 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
378
379 // load path doesn't support component packing
380 SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing.");
381
382 vectors.clear();
383
384 if (fetchState.bInstanceIDOffsetEnable)
385 {
386 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down");
387 }
388
389 Value *vCurIndices;
390 Value *startOffset;
391 if(ied.InstanceEnable)
392 {
393 Value* stepRate = C(ied.InstanceAdvancementState);
394
395 // prevent a div by 0 for 0 step rate
396 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
397 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
398
399 // calc the current offset into instanced data buffer
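// e.g. with a step rate of 3, instances 0..2 read element 0 of the instanced buffer,
// instances 3..5 read element 1, and so on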
400 Value* calcInstance = UDIV(curInstance, stepRate);
401
402 // if step rate is 0, every instance gets instance 0
403 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
404
405 vCurIndices = VBROADCAST(calcInstance);
406
407 startOffset = startInstance;
408 }
409 else if (ied.InstanceStrideEnable)
410 {
411 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
412 }
413 else
414 {
415 // offset indices by baseVertex
416 vCurIndices = ADD(vIndices, vBaseVertex);
417
418 startOffset = startVertex;
419 }
420
421 // load SWR_VERTEX_BUFFER_STATE::pData
422 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
423
424 // load SWR_VERTEX_BUFFER_STATE::pitch
425 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
426 stride = Z_EXT(stride, mInt64Ty);
427
428 // load SWR_VERTEX_BUFFER_STATE::size
429 Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size});
430 size = Z_EXT(size, mInt64Ty);
431
432 Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride);
433
434 Value *minVertex = NULL;
435 Value *minVertexOffset = NULL;
436 if (fetchState.bPartialVertexBuffer) {
437 // fetch min index for low bounds checking
438 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
439 minVertex = LOAD(minVertex);
440 if (!fetchState.bDisableIndexOOBCheck) {
441 minVertexOffset = MUL(Z_EXT(minVertex, mInt64Ty), stride);
442 }
443 }
444
445 // Load from the stream.
446 for(uint32_t lane = 0; lane < mVWidth; ++lane)
447 {
448 // Get index
449 Value* index = VEXTRACT(vCurIndices, C(lane));
450
451 if (fetchState.bPartialVertexBuffer) {
452 // clamp below minvertex
453 Value *isBelowMin = ICMP_SLT(index, minVertex);
454 index = SELECT(isBelowMin, minVertex, index);
455 }
456
457 index = Z_EXT(index, mInt64Ty);
458
459 Value* offset = MUL(index, stride);
460 offset = ADD(offset, C((int64_t)ied.AlignedByteOffset));
461 offset = ADD(offset, startVertexOffset);
462
463 if (!fetchState.bDisableIndexOOBCheck) {
464 // check for out of bound access, including partial OOB, and replace them with minVertex
465 Value *endOffset = ADD(offset, C((int64_t)info.Bpp));
466 Value *oob = ICMP_ULE(endOffset, size);
467 if (fetchState.bPartialVertexBuffer) {
468 offset = SELECT(oob, offset, minVertexOffset);
469 } else {
470 offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0));
471 }
472 }
473
474 Value* pointer = GEP(stream, offset);
475 // We use a full-lane, but don't actually care.
476 Value* vptr = 0;
477
478 // get a pointer to a 4 component attrib in default address space
479 switch(bpc)
480 {
481 case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break;
482 case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break;
483 case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break;
484 default: SWR_INVALID("Unsupported underlying bpp!");
485 }
486
487 // load 4 components of attribute
488 Value* vec = ALIGNED_LOAD(vptr, 1, false);
489
490 // Convert To FP32 internally
491 switch(info.type[0])
492 {
493 case SWR_TYPE_UNORM:
494 switch(bpc)
495 {
496 case 8:
497 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
498 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0))));
499 break;
500 case 16:
501 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
502 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0))));
503 break;
504 default:
505 SWR_INVALID("Unsupported underlying type!");
506 break;
507 }
508 break;
509 case SWR_TYPE_SNORM:
510 switch(bpc)
511 {
512 case 8:
513 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
514 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0))));
515 break;
516 case 16:
517 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
518 vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0))));
519 break;
520 default:
521 SWR_INVALID("Unsupported underlying type!");
522 break;
523 }
524 break;
525 case SWR_TYPE_UINT:
526 // Zero extend uint32_t types.
527 switch(bpc)
528 {
529 case 8:
530 case 16:
531 vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4));
532 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
533 break;
534 case 32:
535 break; // Pass through unchanged.
536 default:
537 SWR_INVALID("Unsupported underlying type!");
538 break;
539 }
540 break;
541 case SWR_TYPE_SINT:
542 // Sign extend SINT types.
543 switch(bpc)
544 {
545 case 8:
546 case 16:
547 vec = S_EXT(vec, VectorType::get(mInt32Ty, 4));
548 vec = BITCAST(vec, VectorType::get(mFP32Ty, 4));
549 break;
550 case 32:
551 break; // Pass through unchanged.
552 default:
553 SWR_INVALID("Unsupported underlying type!");
554 break;
555 }
556 break;
557 case SWR_TYPE_FLOAT:
558 switch(bpc)
559 {
560 case 32:
561 break; // Pass through unchanged.
562 default:
563 SWR_INVALID("Unsupported underlying type!");
564 }
565 break;
566 case SWR_TYPE_USCALED:
567 vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
568 break;
569 case SWR_TYPE_SSCALED:
570 vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4));
571 break;
572 case SWR_TYPE_SFIXED:
573 vec = FMUL(SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)), VBROADCAST(C(1/65536.0f)));
574 break;
575 case SWR_TYPE_UNKNOWN:
576 case SWR_TYPE_UNUSED:
577 SWR_INVALID("Unsupported type %d!", info.type[0]);
578 }
579
580 // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4)
581 // uwvec: 4 x F32, undef value
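// for mVWidth == 8 this widens the 4-wide attribute to 8 lanes: the low 4 lanes keep
// x,y,z,w and the upper 4 lanes replicate an undef element that is never selected later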
582 Value* wvec = VSHUFFLE(vec, uwvec, promoteMask);
583 vectors.push_back(wvec);
584 }
585
586 std::vector<Constant*> v01Mask(mVWidth);
587 std::vector<Constant*> v23Mask(mVWidth);
588 std::vector<Constant*> v02Mask(mVWidth);
589 std::vector<Constant*> v13Mask(mVWidth);
590
591 // Concatenate the vectors together.
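// This sequence transposes the per-vertex (AOS) loads into SOA form: for each 4-wide
// block b, the insert mask places vertex (4*b + c)'s xyzw into lanes [4b..4b+3] of
// elements[c]; the v01/v23 shuffles then interleave the xy and zw pairs, and the
// v02/v13 shuffles split them so elements[0..3] end up holding all x, y, z and w values.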
592 elements[0] = VUNDEF_F();
593 elements[1] = VUNDEF_F();
594 elements[2] = VUNDEF_F();
595 elements[3] = VUNDEF_F();
596 for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
597 {
598 v01Mask[4 * b + 0] = C(0 + 4 * b);
599 v01Mask[4 * b + 1] = C(1 + 4 * b);
600 v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
601 v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
602
603 v23Mask[4 * b + 0] = C(2 + 4 * b);
604 v23Mask[4 * b + 1] = C(3 + 4 * b);
605 v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
606 v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
607
608 v02Mask[4 * b + 0] = C(0 + 4 * b);
609 v02Mask[4 * b + 1] = C(2 + 4 * b);
610 v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
611 v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
612
613 v13Mask[4 * b + 0] = C(1 + 4 * b);
614 v13Mask[4 * b + 1] = C(3 + 4 * b);
615 v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
616 v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
617
618 std::vector<Constant*> iMask(mVWidth);
619 for(uint32_t i = 0; i < mVWidth; ++i)
620 {
621 if(((4 * b) <= i) && (i < (4 * (b + 1))))
622 {
623 iMask[i] = C(i % 4 + mVWidth);
624 }
625 else
626 {
627 iMask[i] = C(i);
628 }
629 }
630 Constant* insertMask = ConstantVector::get(iMask);
631 elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask);
632 elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask);
633 elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask);
634 elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask);
635 }
636
637 Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask));
638 Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask));
639 Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask));
640 Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask));
641 elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask));
642 elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask));
643 elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask));
644 elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask));
645
646 switch(numComponents + 1)
647 {
648 case 1: elements[0] = VIMMED1(0.0f);
649 case 2: elements[1] = VIMMED1(0.0f);
650 case 3: elements[2] = VIMMED1(0.0f);
651 case 4: elements[3] = VIMMED1(1.0f);
652 }
653
654 for(uint32_t c = 0; c < 4; ++c)
655 {
656 #if USE_SIMD16_SHADERS
657 Value* dest = GEP(pVtxOut, C(nelt * 8 + c * 2), "destGEP");
658 #else
659 Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
660 #endif
661 STORE(elements[c], dest);
662 }
663 }
664 }
665
666 // returns true for odd formats that require special gather handling
667 bool FetchJit::IsOddFormat(SWR_FORMAT format)
668 {
669 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
670 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
671 {
672 return true;
673 }
674 return false;
675 }
676
677 // format is uniform if all components are the same size and type
678 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
679 {
680 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
681 uint32_t bpc0 = info.bpc[0];
682 uint32_t type0 = info.type[0];
683
684 for (uint32_t c = 1; c < info.numComps; ++c)
685 {
686 if (bpc0 != info.bpc[c] || type0 != info.type[c])
687 {
688 return false;
689 }
690 }
691 return true;
692 }
693
694 // unpacks components based on format
695 // foreach component in the pixel
696 // mask off everything but this component
697 // shift component to LSB
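// e.g. for a 5:6:5 packed pixel the loop masks bits [4:0], [10:5] and [15:11] in turn,
// shifts each down to bit 0, and stores it in result[] at the format's swizzle position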
698 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
699 {
700 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
701
702 uint32_t bitOffset = 0;
703 for (uint32_t c = 0; c < info.numComps; ++c)
704 {
705 uint32_t swizzledIndex = info.swizzle[c];
706 uint32_t compBits = info.bpc[c];
707 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
708 Value* comp = AND(vInput, bitmask);
709 comp = LSHR(comp, bitOffset);
710
711 result[swizzledIndex] = comp;
712 bitOffset += compBits;
713 }
714 }
715
716 // gather for odd component size formats
717 // gather SIMD full pixels per lane, then shift/mask to move each component into its
718 // own vector
719 void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
720 {
721 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
722
723 // only works if pixel size is <= 32bits
724 SWR_ASSERT(info.bpp <= 32);
725
726 Value* pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
727
728 for (uint32_t comp = 0; comp < 4; ++comp)
729 {
730 pResult[comp] = VIMMED1((int)info.defaults[comp]);
731 }
732
733 UnpackComponents(format, pGather, pResult);
734
735 // cast to fp32
736 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
737 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
738 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
739 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
740 }
741
742 void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
743 {
744 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
745
746 for (uint32_t c = 0; c < info.numComps; ++c)
747 {
748 uint32_t compIndex = info.swizzle[c];
749
750 // skip any conversion on UNUSED components
751 if (info.type[c] == SWR_TYPE_UNUSED)
752 {
753 continue;
754 }
755
756 if (info.isNormalized[c])
757 {
758 if (info.type[c] == SWR_TYPE_SNORM)
759 {
760 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
761
762 /// result = c * (1.0f / (2^(n-1) - 1))
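/// e.g. an 8-bit SNORM component uses scale = 1.0f / 127.0f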
763 uint32_t n = info.bpc[c];
764 uint32_t pow2 = 1 << (n - 1);
765 float scale = 1.0f / (float)(pow2 - 1);
766 Value *vScale = VIMMED1(scale);
767 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
768 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
769 texels[compIndex] = FMUL(texels[compIndex], vScale);
770 }
771 else
772 {
773 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
774
775 /// result = c * (1.0f / (2^n - 1))
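/// e.g. an 8-bit UNORM component uses scale = 1.0f / 255.0f, a 16-bit one 1.0f / 65535.0f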
776 uint32_t n = info.bpc[c];
777 uint32_t pow2 = 1 << n;
778 // special case 24bit unorm format, which requires a full divide to meet ULP requirement
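// (presumably because 1/(2^24 - 1) is not exactly representable in fp32, so a
// reciprocal multiply can miss the ULP tolerance while FDIV is correctly rounded)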
779 if (n == 24)
780 {
781 float scale = (float)(pow2 - 1);
782 Value* vScale = VIMMED1(scale);
783 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
784 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
785 texels[compIndex] = FDIV(texels[compIndex], vScale);
786 }
787 else
788 {
789 float scale = 1.0f / (float)(pow2 - 1);
790 Value *vScale = VIMMED1(scale);
791 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
792 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
793 texels[compIndex] = FMUL(texels[compIndex], vScale);
794 }
795 }
796 continue;
797 }
798 }
799 }
800
801 //////////////////////////////////////////////////////////////////////////
802 /// @brief Loads attributes from memory using AVX2 GATHER(s)
803 /// @param fetchState - info about attributes to be fetched from memory
804 /// @param streams - value pointer to the current vertex stream
805 /// @param vIndices - vector value of indices to gather
806 /// @param pVtxOut - value pointer to output simdvertex struct
807 #if USE_SIMD16_SHADERS
808 #if USE_SIMD16_GATHERS
809 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
810 Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2)
811 #else
812 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
813 Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2)
814 #endif
815 #else
816 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
817 Value* streams, Value* vIndices, Value* pVtxOut)
818 #endif
819 {
820 uint32_t currentVertexElement = 0;
821 uint32_t outputElt = 0;
822 Value* vVertexElements[4];
823 #if USE_SIMD16_GATHERS
824 Value* vVertexElements2[4];
825 #endif
826
827 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
828 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
829 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
830 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
831 curInstance->setName("curInstance");
832
833 for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
834 {
835 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
836
837 // skip element if all components are disabled
838 if (ied.ComponentPacking == ComponentEnable::NONE)
839 {
840 continue;
841 }
842
843 const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
844 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
845 uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
846
847 Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
848
849 // VGATHER* takes an *i8 src pointer
850 Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
851
852 Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
853 Value *vStride = VBROADCAST(stride);
854
855 // max vertex index that is fully in bounds
856 Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
857 maxVertex = LOAD(maxVertex);
858
859 Value *minVertex = NULL;
860 if (fetchState.bPartialVertexBuffer)
861 {
862 // min vertex index for low bounds OOB checking
863 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
864 minVertex = LOAD(minVertex);
865 }
866
867 if (fetchState.bInstanceIDOffsetEnable)
868 {
869 // the InstanceID (curInstance) value is offset by StartInstanceLocation
870 curInstance = ADD(curInstance, startInstance);
871 }
872
873 Value *vCurIndices;
874 #if USE_SIMD16_GATHERS
875 Value *vCurIndices2;
876 #endif
877 Value *startOffset;
878 Value *vInstanceStride = VIMMED1(0);
879
880 if (ied.InstanceEnable)
881 {
882 Value* stepRate = C(ied.InstanceAdvancementState);
883
884 // prevent a div by 0 for 0 step rate
885 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
886 stepRate = SELECT(isNonZeroStep, stepRate, C(1));
887
888 // calc the current offset into instanced data buffer
889 Value* calcInstance = UDIV(curInstance, stepRate);
890
891 // if step rate is 0, every instance gets instance 0
892 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
893
894 vCurIndices = VBROADCAST(calcInstance);
895 #if USE_SIMD16_GATHERS
896 vCurIndices2 = VBROADCAST(calcInstance);
897 #endif
898
899 startOffset = startInstance;
900 }
901 else if (ied.InstanceStrideEnable)
902 {
903 // grab the instance advancement state, which determines the stride in bytes from one instance to the next
904 Value* stepRate = C(ied.InstanceAdvancementState);
905 vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
906
907 // offset indices by baseVertex
908 vCurIndices = ADD(vIndices, vBaseVertex);
909 #if USE_SIMD16_GATHERS
910 vCurIndices2 = ADD(vIndices2, vBaseVertex);
911 #endif
912
913 startOffset = startVertex;
914 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
915 }
916 else
917 {
918 // offset indices by baseVertex
919 vCurIndices = ADD(vIndices, vBaseVertex);
920 #if USE_SIMD16_GATHERS
921 vCurIndices2 = ADD(vIndices2, vBaseVertex);
922 #endif
923
924 startOffset = startVertex;
925 }
926
927 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
928 // do 64bit address offset calculations.
929
930 // calculate byte offset to the start of the VB
931 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
932 pStreamBase = GEP(pStreamBase, baseOffset);
933
934 // if we have a start offset, subtract from max vertex. Used for OOB check
935 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
936 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
937 // if we have a negative value, we're already OOB. clamp at 0.
938 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
939
940 if (fetchState.bPartialVertexBuffer)
941 {
942 // similarly for min vertex
943 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
944 Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
945 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
946 }
947
948 // Load the in bounds size of a partially valid vertex
949 Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
950 partialInboundsSize = LOAD(partialInboundsSize);
951 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
952 Value* vBpp = VBROADCAST(C(info.Bpp));
953 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
954
955 // is the element <= the partially valid size
956 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
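// e.g. if 8 bytes of the last vertex are in bounds, a 4-byte element at AlignedByteOffset 4
// still passes (4 <= 8 - 4), while one at AlignedByteOffset 8 does not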
957
958 #if USE_SIMD16_GATHERS
959 // override cur indices with 0 if pitch is 0
960 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
961 vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2);
962
963 // are vertices partially OOB?
964 Value* vMaxVertex = VBROADCAST(maxVertex);
965 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
966 Value* vPartialOOBMask2 = ICMP_EQ(vCurIndices2, vMaxVertex);
967
968 // are vertices fully in bounds?
969 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
970 Value* vMaxGatherMask2 = ICMP_ULT(vCurIndices2, vMaxVertex);
971
972 Value *vGatherMask;
973 Value *vGatherMask2;
974 if (fetchState.bPartialVertexBuffer)
975 {
976 // are vertices below minVertex limit?
977 Value *vMinVertex = VBROADCAST(minVertex);
978 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
979 Value *vMinGatherMask2 = ICMP_UGE(vCurIndices2, vMinVertex);
980
981 // only fetch lanes that pass both tests
982 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
983 vGatherMask2 = AND(vMaxGatherMask2, vMinGatherMask2);
984 }
985 else
986 {
987 vGatherMask = vMaxGatherMask;
988 vGatherMask2 = vMaxGatherMask2;
989 }
990
991 // blend in any partially OOB indices that have valid elements
992 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
993 vGatherMask2 = SELECT(vPartialOOBMask2, vElementInBoundsMask, vGatherMask2);
994 Value *pMask = vGatherMask;
995 Value *pMask2 = vGatherMask2;
996 vGatherMask = VMASK(vGatherMask);
997 vGatherMask2 = VMASK(vGatherMask2);
998
999 // calculate the actual offsets into the VB
1000 Value* vOffsets = MUL(vCurIndices, vStride);
1001 vOffsets = ADD(vOffsets, vAlignmentOffsets);
1002
1003 Value* vOffsets2 = MUL(vCurIndices2, vStride);
1004 vOffsets2 = ADD(vOffsets2, vAlignmentOffsets);
1005
1006 // if instance stride enable is:
1007 // true - add product of the instanceID and advancement state to the offset into the VB
1008 // false - value of vInstanceStride has been initialized to zero
1009 vOffsets = ADD(vOffsets, vInstanceStride);
1010 vOffsets2 = ADD(vOffsets2, vInstanceStride);
1011
1012 #else
1013 // override cur indices with 0 if pitch is 0
1014 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
1015 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
1016
1017 // are vertices partially OOB?
1018 Value* vMaxVertex = VBROADCAST(maxVertex);
1019 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
1020
1021 // are vertices fully in bounds?
1022 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
1023
1024 Value *vGatherMask;
1025 if (fetchState.bPartialVertexBuffer)
1026 {
1027 // are vertices below minVertex limit?
1028 Value *vMinVertex = VBROADCAST(minVertex);
1029 Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
1030
1031 // only fetch lanes that pass both tests
1032 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
1033 }
1034 else
1035 {
1036 vGatherMask = vMaxGatherMask;
1037 }
1038
1039 // blend in any partially OOB indices that have valid elements
1040 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
1041 Value* pMask = vGatherMask;
1042 vGatherMask = VMASK(vGatherMask);
1043
1044 // calculate the actual offsets into the VB
1045 Value* vOffsets = MUL(vCurIndices, vStride);
1046 vOffsets = ADD(vOffsets, vAlignmentOffsets);
1047
1048 // if instance stride enable is:
1049 // true - add product of the instanceID and advancement state to the offset into the VB
1050 // false - value of vInstanceStride has been initialized to zero
1051 vOffsets = ADD(vOffsets, vInstanceStride);
1052
1053 #endif
1054 // Packing and component control
1055 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
1056 const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
1057 (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
1058
1059 // Special gather/conversion for formats without equal component sizes
1060 if (IsOddFormat((SWR_FORMAT)ied.Format))
1061 {
1062 #if USE_SIMD16_GATHERS
1063 Value *pResults[4];
1064 Value *pResults2[4];
1065 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1066 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask2, pStreamBase, vOffsets2, pResults2);
1067 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1068 ConvertFormat((SWR_FORMAT)ied.Format, pResults2);
1069
1070 for (uint32_t c = 0; c < 4; c += 1)
1071 {
1072 if (isComponentEnabled(compMask, c))
1073 {
1074 vVertexElements[currentVertexElement] = pResults[c];
1075 vVertexElements2[currentVertexElement] = pResults2[c];
1076 currentVertexElement++;
1077
1078 if (currentVertexElement > 3)
1079 {
1080 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1081 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1082
1083 outputElt += 1;
1084
1085 // reset to the next vVertexElement to output
1086 currentVertexElement = 0;
1087 }
1088 }
1089 }
1090 #else
1091 Value* pResults[4];
1092 CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
1093 ConvertFormat((SWR_FORMAT)ied.Format, pResults);
1094
1095 for (uint32_t c = 0; c < 4; ++c)
1096 {
1097 if (isComponentEnabled(compMask, c))
1098 {
1099 vVertexElements[currentVertexElement++] = pResults[c];
1100 if (currentVertexElement > 3)
1101 {
1102 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1103 // reset to the next vVertexElement to output
1104 currentVertexElement = 0;
1105 }
1106 }
1107 }
1108 #endif
1109 }
1110 else if(info.type[0] == SWR_TYPE_FLOAT)
1111 {
1112 ///@todo: support 64 bit vb accesses
1113 Value* gatherSrc = VIMMED1(0.0f);
1114 #if USE_SIMD16_GATHERS
1115 Value* gatherSrc2 = VIMMED1(0.0f);
1116 #endif
1117
1118 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1119 "Unsupported format for standard gather fetch.");
1120
1121 // Gather components from memory to store in a simdvertex structure
1122 switch (bpc)
1123 {
1124 case 16:
1125 {
1126 #if USE_SIMD16_GATHERS
1127 Value* vGatherResult[2];
1128 Value* vGatherResult2[2];
1129 Value *vMask;
1130 Value *vMask2;
1131
1132 // if we have at least one component out of x or y to fetch
1133 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1134 {
1135 // save mask as it is zero'd out after each gather
1136 vMask = vGatherMask;
1137 vMask2 = vGatherMask2;
1138
1139 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask);
1140 vGatherResult2[0] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2);
1141 // e.g. result of first 8x32bit integer gather for 16bit components
1142 // 256i - 0 1 2 3 4 5 6 7
1143 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1144 //
1145 }
1146
1147 // if we have at least one component out of z or w to fetch
1148 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1149 {
1150 // offset base to the next components(zw) in the vertex to gather
1151 pStreamBase = GEP(pStreamBase, C((char)4));
1152 vMask = vGatherMask;
1153 vMask2 = vGatherMask2;
1154
1155 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask);
1156 vGatherResult2[1] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2);
1157 // e.g. result of second 8x32bit integer gather for 16bit components
1158 // 256i - 0 1 2 3 4 5 6 7
1159 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1160 //
1161 }
1162
1163
1164 // if we have at least one component to shuffle into place
1165 if (compMask)
1166 {
1167 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
1168 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1169 Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), Instruction::CastOps::FPExt, CONVERT_NONE,
1170 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
1171
1172 // Shuffle gathered components into place in simdvertex struct
1173 Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
1174 Shuffle16bpcGather(args2, true); // outputs to vVertexElements2 ref
1175 }
1176 #else
1177 Value* vGatherResult[2];
1178 Value *vMask;
1179
1180 // if we have at least one component out of x or y to fetch
1181 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1182 // save mask as it is zero'd out after each gather
1183 vMask = vGatherMask;
1184
1185 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask);
1186 // e.g. result of first 8x32bit integer gather for 16bit components
1187 // 256i - 0 1 2 3 4 5 6 7
1188 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1189 //
1190 }
1191
1192 // if we have at least one component out of z or w to fetch
1193 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1194 // offset base to the next components(zw) in the vertex to gather
1195 pStreamBase = GEP(pStreamBase, C((char)4));
1196 vMask = vGatherMask;
1197
1198 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask);
1199 // e.g. result of second 8x32bit integer gather for 16bit components
1200 // 256i - 0 1 2 3 4 5 6 7
1201 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1202 //
1203 }
1204
1205 // if we have at least one component to shuffle into place
1206 if(compMask){
1207 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
1208 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1209
1210 // Shuffle gathered components into place in simdvertex struct
1211 #if USE_SIMD16_SHADERS
1212 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1213 #else
1214 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1215 #endif
1216 }
1217 #endif
1218 }
1219 break;
1220 case 32:
1221 {
1222 for (uint32_t i = 0; i < 4; i += 1)
1223 {
1224 #if USE_SIMD16_GATHERS
1225 if (isComponentEnabled(compMask, i))
1226 {
1227 // if we need to gather the component
1228 if (compCtrl[i] == StoreSrc)
1229 {
1230 // save mask as it is zero'd out after each gather
1231 Value *vMask = vGatherMask;
1232 Value *vMask2 = vGatherMask2;
1233
1234 // Gather a SIMD of vertices
1235 // APIs allow a 4GB range for offsets
1236 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1237 // But, we know that elements must be aligned for FETCH. :)
1238 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
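// Since offsets are element-aligned (even), 2 * (offset >> 1) == offset, and
// offset >> 1 always fits in 31 bits so it is never interpreted as negative.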
1239 Value *vShiftedOffsets = VPSRLI(vOffsets, C(1));
1240 Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1));
1241 vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, 2);
1242 vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vMask2, 2);
1243
1244 currentVertexElement += 1;
1245 }
1246 else
1247 {
1248 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1249 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1250
1251 currentVertexElement += 1;
1252 }
1253
1254 if (currentVertexElement > 3)
1255 {
1256 #if USE_SIMD16_BUILDER
1257 Value *pVtxSrc2[4];
1258
1259 // pack adjacent pairs of SIMD8s into SIMD16s
1260 for (uint32_t i = 0; i < 4; i += 1)
1261 {
1262 pVtxSrc2[i] = VUNDEF2_F();
1263
1264 pVtxSrc2[i] = INSERT(pVtxSrc2[i], vVertexElements[i], 0);
1265 pVtxSrc2[i] = INSERT(pVtxSrc2[i], vVertexElements2[i], 1);
1266 }
1267
1268 // store SIMD16s
1269 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
1270 StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
1271
1272 #else
1273 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1274 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1275
1276 #endif
1277 outputElt += 1;
1278
1279 // reset to the next vVertexElement to output
1280 currentVertexElement = 0;
1281 }
1282 }
1283
1284 // offset base to the next component in the vertex to gather
1285 pStreamBase = GEP(pStreamBase, C((char)4));
1286 #else
1287 if (isComponentEnabled(compMask, i))
1288 {
1289 // if we need to gather the component
1290 if (compCtrl[i] == StoreSrc)
1291 {
1292 // save mask as it is zero'd out after each gather
1293 Value *vMask = vGatherMask;
1294
1295 // Gather a SIMD of vertices
1296 // APIs allow a 4GB range for offsets
1297 // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
1298 // But, we know that elements must be aligned for FETCH. :)
1299 // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
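// For element-aligned (even) offsets, 2 * (offset >> 1) == offset, and the shifted
// value is always non-negative when treated as a signed 32-bit offset.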
1300 Value* vShiftedOffsets = VPSRLI(vOffsets, C(1));
1301 vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, 2);
1302 }
1303 else
1304 {
1305 #if USE_SIMD16_SHADERS
1306 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1307 #else
1308 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1309 #endif
1310 }
1311
1312 if (currentVertexElement > 3)
1313 {
1314 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1315 // reset to the next vVertexElement to output
1316 currentVertexElement = 0;
1317 }
1318 }
1319
1320 // offset base to the next component in the vertex to gather
1321 pStreamBase = GEP(pStreamBase, C((char)4));
1322 #endif
1323 }
1324 }
1325 break;
1326 case 64:
1327 {
1328 for (uint32_t i = 0; i < 4; i += 1)
1329 {
1330 #if USE_SIMD16_GATHERS
1331 if (isComponentEnabled(compMask, i))
1332 {
1333 // if we need to gather the component
1334 if (compCtrl[i] == StoreSrc)
1335 {
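// split each 8-wide mask/offset set into two 4-wide halves, gather doubles with
// GATHERPD, convert to single precision with VCVTPD2PS, then reassemble each
// pair of halves into an 8 x float SIMD8 result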
1336 Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1337 Value *vMaskLo2 = VSHUFFLE(pMask2, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
1338 Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1339 Value *vMaskHi2 = VSHUFFLE(pMask2, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
1340 vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4));
1341 vMaskLo2 = S_EXT(vMaskLo2, VectorType::get(mInt64Ty, 4));
1342 vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4));
1343 vMaskHi2 = S_EXT(vMaskHi2, VectorType::get(mInt64Ty, 4));
1344 vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4));
1345 vMaskLo2 = BITCAST(vMaskLo2, VectorType::get(mDoubleTy, 4));
1346 vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4));
1347 vMaskHi2 = BITCAST(vMaskHi2, VectorType::get(mDoubleTy, 4));
1348
1349 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1350 Value *vOffsetsLo2 = VEXTRACTI128(vOffsets2, C(0));
1351 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1352 Value *vOffsetsHi2 = VEXTRACTI128(vOffsets2, C(1));
1353
1354 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1355
1356 Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
1357 Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2);
1358 Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
1359 Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2);
1360
1361 pGatherLo = VCVTPD2PS(pGatherLo);
1362 pGatherLo2 = VCVTPD2PS(pGatherLo2);
1363 pGatherHi = VCVTPD2PS(pGatherHi);
1364 pGatherHi2 = VCVTPD2PS(pGatherHi2);
1365
1366 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1367 Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
1368
1369 vVertexElements[currentVertexElement] = pGather;
1370 vVertexElements2[currentVertexElement] = pGather2;
1371
1372 currentVertexElement += 1;
1373 }
1374 else
1375 {
1376 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1377 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1378
1379 currentVertexElement += 1;
1380 }
1381
1382 if (currentVertexElement > 3)
1383 {
1384 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1385 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1386
1387 outputElt += 1;
1388
1389 // reset to the next vVertexElement to output
1390 currentVertexElement = 0;
1391 }
1392 }
1393
1394 // offset base to the next component in the vertex to gather
1395 pStreamBase = GEP(pStreamBase, C((char)8));
1396 #else
1397 if (isComponentEnabled(compMask, i))
1398 {
1399 // if we need to gather the component
1400 if (compCtrl[i] == StoreSrc)
1401 {
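// 64-bit path: split the 8-wide mask and offsets into two 4-wide halves, gather
// doubles with GATHERPD, convert each half to single precision with VCVTPD2PS,
// then shuffle the halves back together into one 8 x float result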
1402 Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
1403 Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
1404 vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4));
1405 vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4));
1406 vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4));
1407 vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4));
1408
1409 Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
1410 Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
1411
1412 Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
1413
1414 Value* pGatherLo = GATHERPD(vZeroDouble,
1415 pStreamBase, vOffsetsLo, vMaskLo);
1416 Value* pGatherHi = GATHERPD(vZeroDouble,
1417 pStreamBase, vOffsetsHi, vMaskHi);
1418
1419 pGatherLo = VCVTPD2PS(pGatherLo);
1420 pGatherHi = VCVTPD2PS(pGatherHi);
1421
1422 Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
1423
1424 vVertexElements[currentVertexElement++] = pGather;
1425 }
1426 else
1427 {
1428 #if USE_SIMD16_SHADERS
1429 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1430 #else
1431 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1432 #endif
1433 }
1434
1435 if (currentVertexElement > 3)
1436 {
1437 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1438 // reset to the next vVertexElement to output
1439 currentVertexElement = 0;
1440 }
1441 }
1442
1443 // offset base to the next component in the vertex to gather
1444 pStreamBase = GEP(pStreamBase, C((char)8));
1445 #endif
1446 }
1447 }
1448 break;
1449 default:
1450 SWR_INVALID("Tried to fetch invalid FP format");
1451 break;
1452 }
1453 }
1454 else
1455 {
1456 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
1457 ConversionType conversionType = CONVERT_NONE;
1458
1459 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
1460 "Unsupported format for standard gather fetch.");
1461
1462 switch(info.type[0])
1463 {
1464 case SWR_TYPE_UNORM:
1465 conversionType = CONVERT_NORMALIZED;
1466 case SWR_TYPE_UINT:
1467 extendCastType = Instruction::CastOps::ZExt;
1468 break;
1469 case SWR_TYPE_SNORM:
1470 conversionType = CONVERT_NORMALIZED;
1471 case SWR_TYPE_SINT:
1472 extendCastType = Instruction::CastOps::SExt;
1473 break;
1474 case SWR_TYPE_USCALED:
1475 conversionType = CONVERT_USCALED;
1476 extendCastType = Instruction::CastOps::UIToFP;
1477 break;
1478 case SWR_TYPE_SSCALED:
1479 conversionType = CONVERT_SSCALED;
1480 extendCastType = Instruction::CastOps::SIToFP;
1481 break;
1482 case SWR_TYPE_SFIXED:
1483 conversionType = CONVERT_SFIXED;
1484 extendCastType = Instruction::CastOps::SExt;
1485 break;
1486 default:
1487 break;
1488 }
1489
1490 // value substituted when component of gather is masked
1491 Value* gatherSrc = VIMMED1(0);
1492 #if USE_SIMD16_GATHERS
1493 Value* gatherSrc2 = VIMMED1(0);
1494 #endif
1495
1496 // Gather components from memory to store in a simdvertex structure
1497 switch (bpc)
1498 {
1499 case 8:
1500 {
1501 // if we have at least one component to fetch
1502 if (compMask)
1503 {
1504 #if USE_SIMD16_GATHERS
1505 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1506 Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
1507 // e.g. result of an 8x32bit integer gather for 8bit components
1508 // 256i - 0 1 2 3 4 5 6 7
1509 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1510
1511 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1512 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1513 Shuffle8bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
1514 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2, info.swizzle);
1515
1516 // Shuffle gathered components into place in simdvertex struct
1517 Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref
1518 Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements2 ref
1519 #else
1520 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1521 // e.g. result of an 8x32bit integer gather for 8bit components
1522 // 256i - 0 1 2 3 4 5 6 7
1523 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1524
1525 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1526 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
1527
1528 // Shuffle gathered components into place in simdvertex struct
1529 #if USE_SIMD16_SHADERS
1530 Shuffle8bpcGatherd(args, useVertexID2); // outputs to vVertexElements ref
1531 #else
1532 Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
1533 #endif
1534 #endif
1535 }
1536 }
1537 break;
1538 case 16:
1539 {
1540 #if USE_SIMD16_GATHERS
1541 Value* vGatherResult[2];
1542 Value *vMask;
1543 Value* vGatherResult2[2];
1544 Value *vMask2;
1545
1546 // if we have at least one component out of x or y to fetch
1547 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1548 {
1549 // save mask as it is zero'd out after each gather
1550 vMask = vGatherMask;
1551 vMask2 = vGatherMask2;
1552
1553 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
1554 vGatherResult2[0] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2);
1555 // e.g. result of first 8x32bit integer gather for 16bit components
1556 // 256i - 0 1 2 3 4 5 6 7
1557 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1558 //
1559 }
1560
1561 // if we have at least one component out of z or w to fetch
1562 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1563 {
1564 // offset base to the next components(zw) in the vertex to gather
1565 pStreamBase = GEP(pStreamBase, C((char)4));
1566 vMask = vGatherMask;
1567 vMask2 = vGatherMask2;
1568
1569 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
1570 vGatherResult2[1] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2);
1571 // e.g. result of second 8x32bit integer gather for 16bit components
1572 // 256i - 0 1 2 3 4 5 6 7
1573 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1574 //
1575 }
1576
1577 // if we have at least one component to shuffle into place
1578 if (compMask)
1579 {
1580 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1581 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1582 Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
1583 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2);
1584
1585 // Shuffle gathered components into place in simdvertex struct
1586 Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
1587 Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref
1588 }
1589 #else
1590 Value* vGatherResult[2];
1591 Value *vMask;
1592
1593 // if we have at least one component out of x or y to fetch
1594 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1595 // save mask as it is zero'd out after each gather
1596 vMask = vGatherMask;
1597
1598 vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
1599 // e.g. result of first 8x32bit integer gather for 16bit components
1600 // 256i - 0 1 2 3 4 5 6 7
1601 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1602 //
1603 }
1604
1605 // if we have at least one component out of z or w to fetch
1606 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1607 // offset base to the next components(zw) in the vertex to gather
1608 pStreamBase = GEP(pStreamBase, C((char)4));
1609 vMask = vGatherMask;
1610
1611 vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
1612 // e.g. result of second 8x32bit integer gather for 16bit components
1613 // 256i - 0 1 2 3 4 5 6 7
1614 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1615 //
1616 }
1617
1618 // if we have at least one component to shuffle into place
1619 if(compMask){
1620 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
1621 currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
1622
1623 // Shuffle gathered components into place in simdvertex struct
1624 #if USE_SIMD16_SHADERS
1625 Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
1626 #else
1627 Shuffle16bpcGather(args); // outputs to vVertexElements ref
1628 #endif
1629 }
1630 #endif
1631 }
1632 break;
1633 case 32:
1634 {
1635 // Gather components from memory into place in the simdvertex struct
1636 for (uint32_t i = 0; i < 4; i++)
1637 {
1638 if (isComponentEnabled(compMask, i))
1639 {
1640 // if we need to gather the component
1641 if (compCtrl[i] == StoreSrc)
1642 {
1643 #if USE_SIMD16_GATHERS
1644 // save mask as it is zero'd out after each gather
1645 Value *vMask = vGatherMask;
1646 Value *vMask2 = vGatherMask2;
1647
1648 Value *pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
1649 Value *pGather2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2);
1650
1651 if (conversionType == CONVERT_USCALED)
1652 {
1653 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1654 pGather2 = UI_TO_FP(pGather2, mSimdFP32Ty);
1655 }
1656 else if (conversionType == CONVERT_SSCALED)
1657 {
1658 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1659 pGather2 = SI_TO_FP(pGather2, mSimdFP32Ty);
1660 }
1661 else if (conversionType == CONVERT_SFIXED)
1662 {
1663 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
1664 pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
1665 }
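// e.g. for SFIXED, the 16.16 fixed point value 1.5 is stored as 0x00018000 (98304),
// and 98304 * (1 / 65536.0f) recovers 1.5f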
1666
1667 vVertexElements[currentVertexElement] = pGather;
1668 vVertexElements2[currentVertexElement] = pGather2;
1669 // e.g. result of a single 8x32bit integer gather for 32bit components
1670 // 256i - 0 1 2 3 4 5 6 7
1671 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1672
1673 currentVertexElement += 1;
1674 #else
1675 // save mask as it is zero'd out after each gather
1676 Value *vMask = vGatherMask;
1677
1678 Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
1679
1680 if (conversionType == CONVERT_USCALED)
1681 {
1682 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1683 }
1684 else if (conversionType == CONVERT_SSCALED)
1685 {
1686 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1687 }
1688 else if (conversionType == CONVERT_SFIXED)
1689 {
1690 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
1691 }
1692
1693 vVertexElements[currentVertexElement++] = pGather;
1694 // e.g. result of a single 8x32bit integer gather for 32bit components
1695 // 256i - 0 1 2 3 4 5 6 7
1696 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1697 #endif
1698 }
1699 else
1700 {
1701 #if USE_SIMD16_SHADERS
1702 #if USE_SIMD16_GATHERS
1703 vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
1704 vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
1705
1706 currentVertexElement += 1;
1707 #else
1708 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1709 #endif
1710 #else
1711 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1712 #endif
1713 }
1714
1715 if (currentVertexElement > 3)
1716 {
1717 #if USE_SIMD16_GATHERS
1718 StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
1719 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
1720
1721 outputElt += 1;
1722 #else
1723 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1724 #endif
1725
1726 // reset to the next vVertexElement to output
1727 currentVertexElement = 0;
1728 }
1729
1730 }
1731
1732 // offset base to the next component in the vertex to gather
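// note: pStreamBase is treated as a byte pointer here, so +4 skips one 32bit component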
1733 pStreamBase = GEP(pStreamBase, C((char)4));
1734 }
1735 }
1736 break;
1737 }
1738 }
1739 }
1740
1741 // if we have a partially filled vVertexElement struct, output it
1742 if (currentVertexElement > 0)
1743 {
1744 #if USE_SIMD16_GATHERS
1745 StoreVertexElements(pVtxOut, outputElt, currentVertexElement, vVertexElements);
1746 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, currentVertexElement, vVertexElements2);
1747
1748 outputElt += 1;
1749 #else
1750 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1751 #endif
1752 }
1753 }
1754
1755 //////////////////////////////////////////////////////////////////////////
1756 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1757 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1758 /// support
1759 /// @param pIndices - pointer to 8 bit indices
1760 /// @param pLastIndex - pointer to last valid index
1761 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1762 {
1763 // can fit 4 8bit integers per vWidth lane
1764 Value* vIndices = VUNDEF_I();
1765
1766 // store 0 index on stack to be used to conditionally load from if index address is OOB
1767 Value* pZeroIndex = ALLOCA(mInt8Ty);
1768 STORE(C((uint8_t)0), pZeroIndex);
1769
1770 // Load a SIMD of index pointers
1771 for(int64_t lane = 0; lane < mVWidth; lane++)
1772 {
1773 // Calculate the address of the requested index
1774 Value *pIndex = GEP(pIndices, C(lane));
1775
1776 // check if the address is less than the max index,
1777 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1778
1779 // if valid, load the index. if not, load 0 from the stack
1780 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1781 Value *index = LOAD(pValid, "valid index");
1782
1783 // zero extend the index to 32 bits and insert it into the correct simd lane
1784 index = Z_EXT(index, mInt32Ty);
1785 vIndices = VINSERT(vIndices, index, lane);
1786 }
1787 return vIndices;
1788 }
1789
1790 //////////////////////////////////////////////////////////////////////////
1791 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1792 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1793 /// support
1794 /// @param pIndices - pointer to 16 bit indices
1795 /// @param pLastIndex - pointer to last valid index
1796 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1797 {
1798 // can fit 2 16 bit integers per vWidth lane
1799 Value* vIndices = VUNDEF_I();
1800
1801 // store 0 index on stack to be used to conditionally load from if index address is OOB
1802 Value* pZeroIndex = ALLOCA(mInt16Ty);
1803 STORE(C((uint16_t)0), pZeroIndex);
1804
1805 // Load a SIMD of index pointers
1806 for(int64_t lane = 0; lane < mVWidth; lane++)
1807 {
1808 // Calculate the address of the requested index
1809 Value *pIndex = GEP(pIndices, C(lane));
1810
1811 // check if the address is less than the max index,
1812 Value* mask = ICMP_ULT(pIndex, pLastIndex);
1813
1814 // if valid, load the index. if not, load 0 from the stack
1815 Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1816 Value *index = LOAD(pValid, "valid index");
1817
1818 // zero extend the index to 32 bits and insert it into the correct simd lane
1819 index = Z_EXT(index, mInt32Ty);
1820 vIndices = VINSERT(vIndices, index, lane);
1821 }
1822 return vIndices;
1823 }
1824
1825 //////////////////////////////////////////////////////////////////////////
1826 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1827 /// @param pIndices - pointer to 32 bit indices
1828 /// @param pLastIndex - pointer to last valid index
1829 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1830 {
1831 DataLayout dL(JM()->mpCurrentModule);
1832 unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
1833 Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
1834 Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
1835
1836 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1837 Value* numIndicesLeft = SUB(iLastIndex,iIndices);
1838 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
1839 numIndicesLeft = SDIV(numIndicesLeft, C(4));
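// 4 == sizeof(uint32_t), the size of a single 32bit index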
1840
1841 // create a vector of index counts from the base index ptr passed into the fetch
1842 const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
1843 Constant* vIndexOffsets = ConstantVector::get(vecIndices);
1844
1845 // compare index count to the max valid index
1846 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
1847 // vIndexOffsets 0 1 2 3 4 5 6 7
1848 // ------------------------------
1849 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
1850 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1851 Value* vMaxIndex = VBROADCAST(numIndicesLeft);
1852 Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
1853
1854 // VMASKLOAD takes an *i8 src pointer
1855 pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
1856
1857 // Load the indices; OOB loads 0
1858 return MASKLOADD(pIndices,vIndexMask);
1859 }
1860
1861 //////////////////////////////////////////////////////////////////////////
1862 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1863 /// denormalizes if needed, converts to F32 if needed, and positions in
1864 /// the proper SIMD rows to be output to the simdvertex structure
1865 /// @param args: (tuple of args, listed below)
1866 /// @param vGatherResult - 8 gathered 8bpc vertices
1867 /// @param pVtxOut - base pointer to output simdvertex struct
1868 /// @param extendType - sign extend or zero extend
1869 /// @param conversionType - conversion to apply to the raw data (normalized, scaled, sfixed or none)
1870 /// @param currentVertexElement - reference to the current vVertexElement
1871 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
1872 /// @param compMask - component packing mask
1873 /// @param compCtrl - component control val
1874 /// @param vVertexElements[4] - vertex components to output
1875 /// @param swizzle[4] - component swizzle location
1876 #if USE_SIMD16_SHADERS
1877 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2)
1878 #else
1879 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
1880 #endif
1881 {
1882 // Unpack tuple args
1883 Value*& vGatherResult = std::get<0>(args);
1884 Value* pVtxOut = std::get<1>(args);
1885 const Instruction::CastOps extendType = std::get<2>(args);
1886 const ConversionType conversionType = std::get<3>(args);
1887 uint32_t &currentVertexElement = std::get<4>(args);
1888 uint32_t &outputElt = std::get<5>(args);
1889 const ComponentEnable compMask = std::get<6>(args);
1890 const ComponentControl (&compCtrl)[4] = std::get<7>(args);
1891 Value* (&vVertexElements)[4] = std::get<8>(args);
1892 const uint32_t (&swizzle)[4] = std::get<9>(args);
1893
1894 // cast types
1895 Type* vGatherTy = mSimdInt32Ty;
1896 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1897
1898 // have to do extra work for sign extending
1899 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
1900 Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 16x8bit ints in a 128bit lane
1901 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1902
1903 // shuffle mask, including any swizzling
1904 const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
1905 const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
1906 Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
1907 char(y), char(y+4), char(y+8), char(y+12),
1908 char(z), char(z+4), char(z+8), char(z+12),
1909 char(w), char(w+4), char(w+8), char(w+12),
1910 char(x), char(x+4), char(x+8), char(x+12),
1911 char(y), char(y+4), char(y+8), char(y+12),
1912 char(z), char(z+4), char(z+8), char(z+12),
1913 char(w), char(w+4), char(w+8), char(w+12)});
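// e.g. for the identity swizzle (x,y,z,w) = (0,1,2,3) this mask is
// {0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15} per 128bit lane, i.e. pick byte x of each
// of the 4 gathered dwords in the lane, then byte y, byte z, byte w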
1914
1915 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
1916 // after pshufb: group components together in each 128bit lane
1917 // 256i - 0 1 2 3 4 5 6 7
1918 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1919
1920 Value* vi128XY = nullptr;
1921 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
1922 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1923 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1924 // 256i - 0 1 2 3 4 5 6 7
1925 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1926 }
1927
1928 // do the same for zw components
1929 Value* vi128ZW = nullptr;
1930 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
1931 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1932 }
1933
1934 // init denormalize variables if needed
1935 Instruction::CastOps fpCast;
1936 Value* conversionFactor;
1937
1938 switch (conversionType)
1939 {
1940 case CONVERT_NORMALIZED:
1941 fpCast = Instruction::CastOps::SIToFP;
1942 conversionFactor = VIMMED1((float)(1.0 / 127.0));
1943 break;
1944 case CONVERT_SSCALED:
1945 fpCast = Instruction::CastOps::SIToFP;
1946 conversionFactor = VIMMED1((float)(1.0));
1947 break;
1948 case CONVERT_USCALED:
1949 SWR_INVALID("Type should not be sign extended!");
1950 conversionFactor = nullptr;
1951 break;
1952 default:
1953 SWR_ASSERT(conversionType == CONVERT_NONE);
1954 conversionFactor = nullptr;
1955 break;
1956 }
1957
1958 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1959 for (uint32_t i = 0; i < 4; i++)
1960 {
1961 if (isComponentEnabled(compMask, i))
1962 {
1963 if (compCtrl[i] == ComponentControl::StoreSrc)
1964 {
1965 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1966 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1967 // if x or y, use vi128XY permute result, else use vi128ZW
1968 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1969
1970 // sign extend
1971 vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
1972
1973 // denormalize if needed
1974 if (conversionType != CONVERT_NONE)
1975 {
1976 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
1977 }
1978 currentVertexElement++;
1979 }
1980 else
1981 {
1982 #if USE_SIMD16_SHADERS
1983 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
1984 #else
1985 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1986 #endif
1987 }
1988
1989 if (currentVertexElement > 3)
1990 {
1991 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1992 // reset to the next vVertexElement to output
1993 currentVertexElement = 0;
1994 }
1995 }
1996 }
1997 }
1998 // else zero extend
1999 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2000 {
2001 // init denormalize variables if needed
2002 Instruction::CastOps fpCast;
2003 Value* conversionFactor;
2004
2005 switch (conversionType)
2006 {
2007 case CONVERT_NORMALIZED:
2008 fpCast = Instruction::CastOps::UIToFP;
2009 conversionFactor = VIMMED1((float)(1.0 / 255.0));
2010 break;
2011 case CONVERT_USCALED:
2012 fpCast = Instruction::CastOps::UIToFP;
2013 conversionFactor = VIMMED1((float)(1.0));
2014 break;
2015 case CONVERT_SSCALED:
2016 SWR_INVALID("Type should not be zero extended!");
2017 conversionFactor = nullptr;
2018 break;
2019 default:
2020 SWR_ASSERT(conversionType == CONVERT_NONE);
2021 conversionFactor = nullptr;
2022 break;
2023 }
2024
2025 // shuffle enabled components into the lower byte of each 32bit lane, zero extending to 32 bits
2026 for (uint32_t i = 0; i < 4; i++)
2027 {
2028 if (isComponentEnabled(compMask, i))
2029 {
2030 if (compCtrl[i] == ComponentControl::StoreSrc)
2031 {
2032 // pshufb masks for each component
2033 Value* vConstMask;
2034 switch (swizzle[i])
2035 {
2036 case 0:
2037 // x shuffle mask
2038 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
2039 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
2040 break;
2041 case 1:
2042 // y shuffle mask
2043 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
2044 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
2045 break;
2046 case 2:
2047 // z shuffle mask
2048 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
2049 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
2050 break;
2051 case 3:
2052 // w shuffle mask
2053 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
2054 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
2055 break;
2056 default:
2057 vConstMask = nullptr;
2058 break;
2059 }
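// the -1 entries have the high bit set, so PSHUFB writes 0 to those bytes,
// which performs the zero extension of each 8bit component to 32 bits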
2060
2061 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
2062 // after pshufb for x channel
2063 // 256i - 0 1 2 3 4 5 6 7
2064 // x000 x000 x000 x000 x000 x000 x000 x000
2065
2066 // denormalize if needed
2067 if (conversionType != CONVERT_NONE)
2068 {
2069 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2070 }
2071 currentVertexElement++;
2072 }
2073 else
2074 {
2075 #if USE_SIMD16_SHADERS
2076 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2077 #else
2078 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2079 #endif
2080 }
2081
2082 if (currentVertexElement > 3)
2083 {
2084 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2085 // reset to the next vVertexElement to output
2086 currentVertexElement = 0;
2087 }
2088 }
2089 }
2090 }
2091 else
2092 {
2093 SWR_INVALID("Unsupported conversion type");
2094 }
2095 }
2096
2097 //////////////////////////////////////////////////////////////////////////
2098 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
2099 /// denormalizes if needed, converts to F32 if needed, and positions in
2100 /// the proper SIMD rows to be output to the simdvertex structure
2101 /// @param args: (tuple of args, listed below)
2102 /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
2103 /// @param pVtxOut - base pointer to output simdvertex struct
2104 /// @param extendType - sign extend or zero extend
2105 /// @param conversionType - conversion to apply to the raw data (normalized, scaled, sfixed or none)
2106 /// @param currentVertexElement - reference to the current vVertexElement
2107 /// @param outputElt - reference to the current offset from simdvertex we're outputting to
2108 /// @param compMask - component packing mask
2109 /// @param compCtrl - component control val
2110 /// @param vVertexElements[4] - vertex components to output
2111 #if USE_SIMD16_SHADERS
2112 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2)
2113 #else
2114 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
2115 #endif
2116 {
2117 // Unpack tuple args
2118 Value* (&vGatherResult)[2] = std::get<0>(args);
2119 Value* pVtxOut = std::get<1>(args);
2120 const Instruction::CastOps extendType = std::get<2>(args);
2121 const ConversionType conversionType = std::get<3>(args);
2122 uint32_t &currentVertexElement = std::get<4>(args);
2123 uint32_t &outputElt = std::get<5>(args);
2124 const ComponentEnable compMask = std::get<6>(args);
2125 const ComponentControl(&compCtrl)[4] = std::get<7>(args);
2126 Value* (&vVertexElements)[4] = std::get<8>(args);
2127
2128 // cast types
2129 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
2130 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
2131
2132 // have to do extra work for sign extending
2133 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
2134 (extendType == Instruction::CastOps::FPExt))
2135 {
2136 // is this a half-precision (16bit) float?
2137 bool bFP = (extendType == Instruction::CastOps::FPExt);
2138
2139 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
2140 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
2141
2142 // shuffle mask
2143 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
2144 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
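// within each 128bit lane, bytes {0,1 4,5 8,9 12,13} select the low 16 bits (x) of each
// gathered dword and bytes {2,3 6,7 10,11 14,15} select the high 16 bits (y)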
2145 Value* vi128XY = nullptr;
2146 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
2147 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
2148 // after pshufb: group components together in each 128bit lane
2149 // 256i - 0 1 2 3 4 5 6 7
2150 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
2151
2152 vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
2153 // after PERMD: move and pack xy components into each 128bit lane
2154 // 256i - 0 1 2 3 4 5 6 7
2155 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
2156 }
2157
2158 // do the same for zw components
2159 Value* vi128ZW = nullptr;
2160 if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
2161 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
2162 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
2163 }
2164
2165 // init denormalize variables if needed
2166 Instruction::CastOps IntToFpCast;
2167 Value* conversionFactor;
2168
2169 switch (conversionType)
2170 {
2171 case CONVERT_NORMALIZED:
2172 IntToFpCast = Instruction::CastOps::SIToFP;
2173 conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2174 break;
2175 case CONVERT_SSCALED:
2176 IntToFpCast = Instruction::CastOps::SIToFP;
2177 conversionFactor = VIMMED1((float)(1.0));
2178 break;
2179 case CONVERT_USCALED:
2180 SWR_INVALID("Type should not be sign extended!");
2181 conversionFactor = nullptr;
2182 break;
2183 default:
2184 SWR_ASSERT(conversionType == CONVERT_NONE);
2185 conversionFactor = nullptr;
2186 break;
2187 }
2188
2189 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
2190 for (uint32_t i = 0; i < 4; i++)
2191 {
2192 if (isComponentEnabled(compMask, i))
2193 {
2194 if (compCtrl[i] == ComponentControl::StoreSrc)
2195 {
2196 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2197 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2198 // if x or y, use vi128XY permute result, else use vi128ZW
2199 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
2200
2201 if (bFP) {
2202 // extract 128 bit lanes and convert each component from 16bit float to 32bit float
2203 vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2204 }
2205 else {
2206 // extract 128 bit lanes to sign extend each component
2207 vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2208
2209 // denormalize if needed
2210 if (conversionType != CONVERT_NONE) {
2211 vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2212 }
2213 }
2214 currentVertexElement++;
2215 }
2216 else
2217 {
2218 #if USE_SIMD16_SHADERS
2219 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2220 #else
2221 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2222 #endif
2223 }
2224
2225 if (currentVertexElement > 3)
2226 {
2227 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2228 // reset to the next vVertexElement to output
2229 currentVertexElement = 0;
2230 }
2231 }
2232 }
2233 }
2234 // else zero extend
2235 else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
2236 {
2237 // pshufb masks for each component
2238 Value* vConstMask[2];
2239 if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
2240 // x/z shuffle mask
2241 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2242 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
2243 }
2244
2245 if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
2246 // y/w shuffle mask
2247 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2248 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
2249 }
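// the -1 mask entries are zeroed by PSHUFB, zero extending each 16bit component to 32 bits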
2250
2251 // init denormalize variables if needed
2252 Instruction::CastOps fpCast;
2253 Value* conversionFactor;
2254
2255 switch (conversionType)
2256 {
2257 case CONVERT_NORMALIZED:
2258 fpCast = Instruction::CastOps::UIToFP;
2259 conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2260 break;
2261 case CONVERT_USCALED:
2262 fpCast = Instruction::CastOps::UIToFP;
2263 conversionFactor = VIMMED1((float)(1.0f));
2264 break;
2265 case CONVERT_SSCALED:
2266 SWR_INVALID("Type should not be zero extended!");
2267 conversionFactor = nullptr;
2268 break;
2269 default:
2270 SWR_ASSERT(conversionType == CONVERT_NONE);
2271 conversionFactor = nullptr;
2272 break;
2273 }
2274
2275 // shuffle enabled components into the lower word of each 32bit lane, zero extending to 32 bits
2276 for (uint32_t i = 0; i < 4; i++)
2277 {
2278 if (isComponentEnabled(compMask, i))
2279 {
2280 if (compCtrl[i] == ComponentControl::StoreSrc)
2281 {
2282 // select correct constMask for x/z or y/w pshufb
2283 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2284 // if x or y, use vi128XY permute result, else use vi128ZW
2285 uint32_t selectedGather = (i < 2) ? 0 : 1;
2286
2287 vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
2288 // after pshufb mask for x channel; z uses the same shuffle from the second gather
2289 // 256i - 0 1 2 3 4 5 6 7
2290 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2291
2292 // denormalize if needed
2293 if (conversionType != CONVERT_NONE)
2294 {
2295 vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
2296 }
2297 currentVertexElement++;
2298 }
2299 else
2300 {
2301 #if USE_SIMD16_SHADERS
2302 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
2303 #else
2304 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2305 #endif
2306 }
2307
2308 if (currentVertexElement > 3)
2309 {
2310 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2311 // reset to the next vVertexElement to output
2312 currentVertexElement = 0;
2313 }
2314 }
2315 }
2316 }
2317 else
2318 {
2319 SWR_INVALID("Unsupported conversion type");
2320 }
2321 }
2322
2323 //////////////////////////////////////////////////////////////////////////
2324 /// @brief Output a simdvertex worth of elements to the current outputElt
2325 /// @param pVtxOut - base address of VIN output struct
2326 /// @param outputElt - simdvertex offset in VIN to write to
2327 /// @param numEltsToStore - number of simdvertex rows to write out
2328 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
2329 void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2330 {
2331 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2332
2333 for(uint32_t c = 0; c < numEltsToStore; ++c)
2334 {
2335 // STORE expects FP32 x vWidth type, just bitcast if needed
2336 if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2337 {
2338 #if FETCH_DUMP_VERTEX
2339 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
2340 #endif
2341 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
2342 }
2343 #if FETCH_DUMP_VERTEX
2344 else
2345 {
2346 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
2347 }
2348 #endif
2349 // outputElt * 4 = offsetting by the size of a simdvertex
2350 // + c offsets to a 32bit x vWidth row within the current vertex (SIMD16 shaders use outputElt * 8 + c * 2 since each row is split across two vWidth rows)
2351 #if USE_SIMD16_SHADERS
2352 Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP");
2353 #else
2354 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
2355 #endif
2356 STORE(vVertexElements[c], dest);
2357 }
2358 }
2359
2360 #if USE_SIMD16_BUILDER
2361 void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
2362 {
2363 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2364
2365 for (uint32_t c = 0; c < numEltsToStore; ++c)
2366 {
2367 // STORE expects FP32 x vWidth type, just bitcast if needed
2368 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2369 {
2370 #if FETCH_DUMP_VERTEX
2371 PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
2372 #endif
2373 vVertexElements[c] = BITCAST(vVertexElements[c], mSimd2FP32Ty);
2374 }
2375 #if FETCH_DUMP_VERTEX
2376 else
2377 {
2378 PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
2379 }
2380 #endif
2381 // outputElt * 4 = offsetting by the size of a simdvertex
2382 // + c offsets to a 32bit x vWidth row within the current vertex
2383 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
2384 STORE(vVertexElements[c], dest);
2385 }
2386 }
2387
2388 #endif
2389 //////////////////////////////////////////////////////////////////////////
2390 /// @brief Generates a constant vector of values based on the
2391 /// ComponentControl value
2392 /// @param ctrl - ComponentControl value
2393 #if USE_SIMD16_SHADERS
2394 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2)
2395 #else
2396 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
2397 #endif
2398 {
2399 switch(ctrl)
2400 {
2401 case NoStore: return VUNDEF_I();
2402 case Store0: return VIMMED1(0);
2403 case Store1Fp: return VIMMED1(1.0f);
2404 case Store1Int: return VIMMED1(1);
2405 case StoreVertexId:
2406 {
2407 #if USE_SIMD16_SHADERS
2408 Value* pId;
2409 if (useVertexID2)
2410 {
2411 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
2412 }
2413 else
2414 {
2415 pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
2416 }
2417 #else
2418 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
2419 #endif
2420 return VBROADCAST(pId);
2421 }
2422 case StoreInstanceId:
2423 {
2424 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
2425 return VBROADCAST(pId);
2426 }
2427 case StoreSrc:
2428 default: SWR_INVALID("Invalid component control"); return VUNDEF_I();
2429 }
2430 }
2431
2432 //////////////////////////////////////////////////////////////////////////
2433 /// @brief Returns the enable mask for the specified component.
2434 /// @param enableMask - enable bits
2435 /// @param component - component to check if enabled.
2436 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
2437 {
2438 switch (component)
2439 {
2440 // X
2441 case 0: return (enableMask & ComponentEnable::X);
2442 // Y
2443 case 1: return (enableMask & ComponentEnable::Y);
2444 // Z
2445 case 2: return (enableMask & ComponentEnable::Z);
2446 // W
2447 case 3: return (enableMask & ComponentEnable::W);
2448
2449 default: return false;
2450 }
2451 }
2452
2453
2454 //////////////////////////////////////////////////////////////////////////
2455 /// @brief JITs from fetch shader IR
2456 /// @param hJitMgr - JitManager handle
2457 /// @param func - LLVM function IR
2458 /// @return PFN_FETCH_FUNC - pointer to fetch code
2459 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
2460 {
2461 const llvm::Function* func = (const llvm::Function*)hFunc;
2462 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2463 PFN_FETCH_FUNC pfnFetch;
2464
2465 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
2466 // MCJIT finalizes modules the first time you JIT code from them. Once a module is finalized, you cannot add new IR to it
2467 pJitMgr->mIsModuleFinalized = true;
2468
2469 #if defined(KNOB_SWRC_TRACING)
2470 char fName[1024];
2471 const char *funcName = func->getName().data();
2472 sprintf(fName, "%s.bin", funcName);
2473 FILE *fd = fopen(fName, "wb");
2474 fwrite((void *)pfnFetch, 1, 2048, fd);
2475 fclose(fd);
2476 #endif
2477
2478 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
2479
2480 return pfnFetch;
2481 }
2482
2483 //////////////////////////////////////////////////////////////////////////
2484 /// @brief JIT compiles fetch shader
2485 /// @param hJitMgr - JitManager handle
2486 /// @param state - fetch state to build function from
2487 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
2488 {
2489 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2490
2491 pJitMgr->SetupNewModule();
2492
2493 FetchJit theJit(pJitMgr);
2494 HANDLE hFunc = theJit.Create(state);
2495
2496 return JitFetchFunc(hJitMgr, hFunc);
2497 }