swr/rast: Don't include private context in gather args
[mesa.git] src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder_mem.cpp
*
* @brief Implementation for the memory (load/store/gather/scatter) builder functions
*
* Notes:
*
******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"
#include "common/rdtsc_buckets.h"

#include <cstdarg>


namespace SwrJit
{

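    //////////////////////////////////////////////////////////////////////////
    /// @brief Convenience wrappers that build a GEP index vector from an
    ///        initializer list (of Value* or constant uint32_t indices) and
    ///        forward to GEPA.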
    Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return GEPA(ptr, indices);
    }

    Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return GEPA(ptr, indices);
    }

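    //////////////////////////////////////////////////////////////////////////
    /// @brief Same as GEP above, but emits an inbounds GEP.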
    Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return IN_BOUNDS_GEP(ptr, indices);
    }

    Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return IN_BOUNDS_GEP(ptr, indices);
    }

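    //////////////////////////////////////////////////////////////////////////
    /// @brief LOAD/STORE helpers that first compute the address with GEPA
    ///        from an index list, then issue the load or store at that address.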
    LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return STORE(val, GEPA(basePtr, valIndices));
    }

    StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return STORE(val, GEPA(basePtr, valIndices));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate an i32 masked load operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with a float masked load
    /// @param src - base address pointer for the load
    /// @param mask - SIMD wide mask that controls whether to access memory or load 0
    Value *Builder::MASKLOADD(Value* src, Value* mask)
    {
        Value* vResult;
        // use avx2 masked load instruction if available
        if (JM()->mArch.AVX2())
        {
            Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
            vResult = CALL(func, { src, mask });
        }
        else
        {
            // maskload intrinsic expects integer mask operand in llvm >= 3.8
#if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
            mask = BITCAST(mask, VectorType::get(mInt32Ty, mVWidth));
#else
            mask = BITCAST(mask, VectorType::get(mFP32Ty, mVWidth));
#endif
            Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskload_ps_256);
            vResult = BITCAST(CALL(func, { src, mask }), VectorType::get(mInt32Ty, mVWidth));
        }
        return vResult;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
    {
        Value *vGather;

        // use avx2 gather instruction if available
        if (JM()->mArch.AVX2())
        {
            // force mask to <N x float>, required by vgather
            Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);

            vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale));
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = VUNDEF_F();
            Value *vScaleVec = VIMMED1((uint32_t)scale);
            Value *vOffsets = MUL(vIndices, vScaleVec);
            for (uint32_t i = 0; i < mVWidth; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets, C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase, offset);
                loadAddress = BITCAST(loadAddress, PointerType::get(mFP32Ty, 0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
                Value *selMask = VEXTRACT(vMask, C(i));
                // switch in the safe (stack) address when this lane is masked off
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress);
                vGather = VINSERT(vGather, val, C(i));
            }

            STACKRESTORE(pStack);
        }

        return vGather;
    }

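    //////////////////////////////////////////////////////////////////////////
    /// @brief 16-wide version of GATHERPS.  Uses the AVX512F gather when
    ///        available, otherwise splits the operands in half and issues
    ///        two GATHERPS calls.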
    Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
    {
        Value *vGather = VUNDEF_F_16();

        // use AVX512F gather instruction if available
        if (JM()->mArch.AVX512F())
        {
            // force mask to <N-bit integer>, required by vgather
            Value *mask = BITCAST(vMask, mInt16Ty);

            vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
        }
        else
        {
            Value *src0 = EXTRACT_16(vSrc, 0);
            Value *src1 = EXTRACT_16(vSrc, 1);

            Value *indices0 = EXTRACT_16(vIndices, 0);
            Value *indices1 = EXTRACT_16(vIndices, 1);

            Value *mask0 = EXTRACT_16(vMask, 0);
            Value *mask1 = EXTRACT_16(vMask, 1);

            Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
            Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);

            vGather = JOIN_16(gather0, gather1);
        }

        return vGather;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        Value* vGather;

        // use avx2 gather instruction if available
        if (JM()->mArch.AVX2())
        {
            vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = VUNDEF_I();
            Value *vScaleVec = VIMMED1((uint32_t)scale);
            Value *vOffsets = MUL(vIndices, vScaleVec);
            for (uint32_t i = 0; i < mVWidth; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets, C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase, offset);
                loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
                Value *selMask = VEXTRACT(vMask, C(i));
                // switch in the safe (stack) address when this lane is masked off
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress, C(0));
                vGather = VINSERT(vGather, val, C(i));
            }

            STACKRESTORE(pStack);
        }

        return vGather;
    }

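    //////////////////////////////////////////////////////////////////////////
    /// @brief 16-wide version of GATHERDD.  Uses the AVX512F gather when
    ///        available, otherwise splits the operands in half and issues
    ///        two GATHERDD calls.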
    Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
    {
        Value *vGather = VUNDEF_I_16();

        // use AVX512F gather instruction if available
        if (JM()->mArch.AVX512F())
        {
            // force mask to <N-bit integer>, required by vgather
            Value *mask = BITCAST(vMask, mInt16Ty);

            vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
        }
        else
        {
            Value *src0 = EXTRACT_16(vSrc, 0);
            Value *src1 = EXTRACT_16(vSrc, 1);

            Value *indices0 = EXTRACT_16(vIndices, 0);
            Value *indices1 = EXTRACT_16(vIndices, 1);

            Value *mask0 = EXTRACT_16(vMask, 0);
            Value *mask1 = EXTRACT_16(vMask, 1);

            Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
            Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);

            vGather = JOIN_16(gather0, gather1);
        }

        return vGather;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        Value* vGather;

        // use avx2 gather instruction if available
        if (JM()->mArch.AVX2())
        {
            vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth / 2)), VectorType::get(mDoubleTy, mVWidth / 2));
            vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
            Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
            Value *vOffsets = MUL(vIndices, vScaleVec);
            for (uint32_t i = 0; i < mVWidth / 2; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets, C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase, offset);
                loadAddress = BITCAST(loadAddress, PointerType::get(mDoubleTy, 0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
                Value *selMask = VEXTRACT(vMask, C(i));
                // switch in the safe (stack) address when this lane is masked off
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress);
                vGather = VINSERT(vGather, val, C(i));
            }
            STACKRESTORE(pStack);
        }
        return vGather;
    }

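    //////////////////////////////////////////////////////////////////////////
    /// @brief Gather up to 4 components per lane for the given format,
    ///        dispatching to the float (GATHER4PS) or integer (GATHER4DD) path.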
    void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
        Value* mask, Value* vGatherComponents[], bool bPackedOutput)
    {
        const SWR_FORMAT_INFO &info = GetFormatInfo(format);
        if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
        {
            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
        }
        else
        {
            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
        }
    }

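    //////////////////////////////////////////////////////////////////////////
    /// @brief Gather float components, selecting the 16bpc or 32bpc path from
    ///        the per-component bit width (info.bpp / info.numComps).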
    void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
        Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
    {
        switch (info.bpp / info.numComps)
        {
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((float)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = GEP(pSrcBase, C((char)4));

                vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);

                // offset base to the next component to gather
                pSrcBase = GEP(pSrcBase, C((char)4));
            }
        }
        break;
        default:
            SWR_INVALID("Invalid float format");
            break;
        }
    }

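    //////////////////////////////////////////////////////////////////////////
    /// @brief Gather integer components, selecting the 8bpc, 16bpc, or 32bpc
    ///        path from the per-component bit width (info.bpp / info.numComps).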
    void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
        Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
    {
        switch (info.bpp / info.numComps)
        {
        case 8:
        {
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
            Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
            // e.g. result of an 8x32bit integer gather for 8bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

            Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = GEP(pSrcBase, C((char)4));

                vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);

        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);

                // offset base to the next component to gather
                pSrcBase = GEP(pSrcBase, C((char)4));
            }
        }
        break;
        default:
            SWR_INVALID("unsupported format");
            break;
        }
    }

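    //////////////////////////////////////////////////////////////////////////
    /// @brief Shuffle the results of two 16bpc gathers (xy and zw pairs packed
    ///        in each 32bit lane) into per-component outputs, either packed
    ///        into 128bit lanes or zero-extended to one component per 32bit lane.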
    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        // input could either be float or int vector; do shuffle work in int
        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

        if (bPackedOutput)
        {
            Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                          0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
                vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
            }

            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }

        }
        else
        {
            // pshufb masks for each component
            Value* vConstMask[2];
            // x/z shuffle mask
            vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                                      0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });

            // y/w shuffle mask
            vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                      2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });


            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // select correct constMask for x/z or y/w pshufb
                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use the first gather input, else use the second (zw) gather input
                uint32_t selectedGather = (i < 2) ? 0 : 1;

                vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
                // after pshufb mask for x channel; z uses the same shuffle from the second gather
                // 256i - 0    1    2    3    4    5    6    7
                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
            }
        }
    }

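    //////////////////////////////////////////////////////////////////////////
    /// @brief Shuffle the result of an 8bpc gather (xyzw packed in each 32bit
    ///        lane) into per-component outputs, either packed into 128bit
    ///        lanes or zero-extended to one component per 32bit lane.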
    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        if (bPackedOutput)
        {
            Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
            // shuffle mask
            Value* vConstMask = C<char>({ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                          0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 });
            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

            Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
            }

            // sign extend all enabled components.  If we have a full vVertexElements, output to the current simdvertex
            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // sign extend
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        // else zero extend
        else
        {
            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // pshufb masks for each component
                Value* vConstMask;
                switch (i)
                {
                case 0:
                    // x shuffle mask
                    vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                           0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
                    break;
                case 1:
                    // y shuffle mask
                    vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                           1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
                    break;
                case 2:
                    // z shuffle mask
                    vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                           2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
                    break;
                case 3:
                    // w shuffle mask
                    vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                           3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
                    break;
                default:
                    vConstMask = nullptr;
                    break;
                }

                vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                // after pshufb for x channel
                // 256i - 0    1    2    3    4    5    6    7
                //        x000 x000 x000 x000 x000 x000 x000 x000
            }
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Emulates a scatter operation.
    /// @param pDst - pointer to destination
    /// @param vSrc - vector of src data to scatter
    /// @param vOffsets - vector of byte offsets from pDst
    /// @param vMask - mask of valid lanes
    void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
    {
        /* Scatter algorithm

           while (Index = BitScanForward(mask))
               srcElem = srcVector[Index]
               offsetElem = offsetVector[Index]
               *(pDst + offsetElem) = srcElem
               Update mask (&= ~(1 << Index))

        */

        BasicBlock* pCurBB = IRB()->GetInsertBlock();
        Function* pFunc = pCurBB->getParent();
        Type* pSrcTy = vSrc->getType()->getVectorElementType();

        // Store vectors on stack
        if (pScatterStackSrc == nullptr)
        {
            // Save off stack allocations and reuse per scatter. Significantly reduces stack
            // requirements for shaders with a lot of scatters.
            pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
            pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
        }

        Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
        Value* pOffsetsArrayPtr = pScatterStackOffsets;
        STORE(vSrc, pSrcArrayPtr);
        STORE(vOffsets, pOffsetsArrayPtr);

        // Cast to pointers for random access
        pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
        pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

        Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));

        // Get cttz function
        Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });

        // Setup loop basic block
        BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);

        // compute first set bit
        Value* pIndex = CALL(pfnCttz, { pMask, C(false) });

        Value* pIsUndef = ICMP_EQ(pIndex, C(32));

        // Split current block
        BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

        // Remove unconditional jump created by splitBasicBlock
        pCurBB->getTerminator()->eraseFromParent();

        // Add terminator to end of original block
        IRB()->SetInsertPoint(pCurBB);

        // Add conditional branch
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Add loop basic block contents
        IRB()->SetInsertPoint(pLoop);
        PHINode* pIndexPhi = PHI(mInt32Ty, 2);
        PHINode* pMaskPhi = PHI(mInt32Ty, 2);

        pIndexPhi->addIncoming(pIndex, pCurBB);
        pMaskPhi->addIncoming(pMask, pCurBB);

        // Extract elements for this index
        Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });

        // GEP to this offset in dst
        Value* pCurDst = GEP(pDst, pOffsetElem);
        pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
        STORE(pSrcElem, pCurDst);

        // Update the mask
        Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

        // Terminator
        Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });

        pIsUndef = ICMP_EQ(pNewIndex, C(32));
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Update phi edges
        pIndexPhi->addIncoming(pNewIndex, pLoop);
        pMaskPhi->addIncoming(pNewMask, pLoop);

        // Move builder to beginning of post loop
        IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief save/restore stack, providing ability to push/pop the stack and
    ///        reduce overall stack requirements for temporary stack use
    Value* Builder::STACKSAVE()
    {
        Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
        return CALLA(pfnStackSave);
    }

    void Builder::STACKRESTORE(Value* pSaved)
    {
        Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
        CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
    }

}