swr/rast: Fix GATHERPS to avoid assertions.
[mesa.git] / src / gallium / drivers / swr / rasterizer / jitter / builder_mem.cpp
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder_mem.cpp
*
* @brief Implementation for memory (load/store/gather/scatter) builder functions
*
* Notes:
*
******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"
#include "common/rdtsc_buckets.h"

#include <cstdarg>


namespace SwrJit
{

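    //////////////////////////////////////////////////////////////////////////
    /// @brief Convenience GEP wrappers: build the index vector from an
    ///        initializer list (Value* or uint32_t constants) and forward to GEPA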
    Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return GEPA(ptr, indices);
    }

    Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return GEPA(ptr, indices);
    }

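    //////////////////////////////////////////////////////////////////////////
    /// @brief Same as GEP above, but forwards to the inbounds GEP overload so
    ///        the resulting address is marked as staying within the base object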
    Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return IN_BOUNDS_GEP(ptr, indices);
    }

    Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return IN_BOUNDS_GEP(ptr, indices);
    }

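    //////////////////////////////////////////////////////////////////////////
    /// @brief Loads from the address formed by indexing basePtr with the given
    ///        constant indices (LOAD) or runtime Value* indices (LOADV)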
    LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return LOAD(GEPA(basePtr, valIndices), name);
    }

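    //////////////////////////////////////////////////////////////////////////
    /// @brief Stores val to the address formed by indexing basePtr with the
    ///        given constant indices (STORE) or runtime Value* indices (STOREV)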
    StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return STORE(val, GEPA(basePtr, valIndices));
    }

    StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return STORE(val, GEPA(basePtr, valIndices));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate an i32 masked load operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with a float masked load
    /// @param src - base address pointer for the load
    /// @param mask - SIMD wide mask that controls whether to access memory or load 0
    Value *Builder::MASKLOADD(Value* src, Value* mask)
    {
        Value* vResult;
        // use avx2 maskload instruction if available
        if (JM()->mArch.AVX2())
        {
            Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
            vResult = CALL(func, { src, mask });
        }
        else
        {
            // maskload intrinsic expects integer mask operand in llvm >= 3.8
#if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
            mask = BITCAST(mask, VectorType::get(mInt32Ty, mVWidth));
#else
            mask = BITCAST(mask, VectorType::get(mFP32Ty, mVWidth));
#endif
            Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskload_ps_256);
            vResult = BITCAST(CALL(func, { src, mask }), VectorType::get(mInt32Ty, mVWidth));
        }
        return vResult;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
    {
        Value *vGather;
        Value *pBasePtr = INT_TO_PTR(pBase, PointerType::get(mInt8Ty, 0));

        // use avx2 gather instruction if available
        if (JM()->mArch.AVX2())
        {
            // force mask to <N x float>, required by vgather
            Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);

            vGather = VGATHERPS(vSrc, pBasePtr, vIndices, mask, C(scale));
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = VUNDEF_F();
            Value *vScaleVec = VIMMED1((uint32_t)scale);
            Value *vOffsets = MUL(vIndices, vScaleVec);
            for (uint32_t i = 0; i < mVWidth; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets, C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBasePtr, offset);
                loadAddress = BITCAST(loadAddress, PointerType::get(mFP32Ty, 0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
                Value *selMask = VEXTRACT(vMask, C(i));
                // switch in a safe address to load if this lane is masked off
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress);
                vGather = VINSERT(vGather, val, C(i));
            }

            STACKRESTORE(pStack);
        }

        return vGather;
    }

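    //////////////////////////////////////////////////////////////////////////
    /// @brief 16-wide float gather; uses the AVX512F gather when available,
    ///        otherwise splits into two 8-wide GATHERPS operations and joins
    ///        the results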
    Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
    {
        Value *vGather = VUNDEF_F_16();

        // use AVX512F gather instruction if available
        if (JM()->mArch.AVX512F())
        {
            // force mask to <N-bit Integer>, required by vgather2
            Value *mask = BITCAST(vMask, mInt16Ty);

            vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
        }
        else
        {
            Value *src0 = EXTRACT_16(vSrc, 0);
            Value *src1 = EXTRACT_16(vSrc, 1);

            Value *indices0 = EXTRACT_16(vIndices, 0);
            Value *indices1 = EXTRACT_16(vIndices, 1);

            Value *mask0 = EXTRACT_16(vMask, 0);
            Value *mask1 = EXTRACT_16(vMask, 1);

            Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
            Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);

            vGather = JOIN_16(gather0, gather1);
        }

        return vGather;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        Value* vGather;

        // use avx2 gather instruction if available
        if (JM()->mArch.AVX2())
        {
            vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = VUNDEF_I();
            Value *vScaleVec = VIMMED1((uint32_t)scale);
            Value *vOffsets = MUL(vIndices, vScaleVec);
            for (uint32_t i = 0; i < mVWidth; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets, C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase, offset);
                loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
                Value *selMask = VEXTRACT(vMask, C(i));
                // switch in a safe address to load if this lane is masked off
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress, C(0));
                vGather = VINSERT(vGather, val, C(i));
            }

            STACKRESTORE(pStack);
        }

        return vGather;
    }

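    //////////////////////////////////////////////////////////////////////////
    /// @brief 16-wide i32 gather; uses the AVX512F gather when available,
    ///        otherwise splits into two 8-wide GATHERDD operations and joins
    ///        the results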
    Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
    {
        Value *vGather = VUNDEF_I_16();

        // use AVX512F gather instruction if available
        if (JM()->mArch.AVX512F())
        {
            // force mask to <N-bit Integer>, required by vgather2
            Value *mask = BITCAST(vMask, mInt16Ty);

            vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
        }
        else
        {
            Value *src0 = EXTRACT_16(vSrc, 0);
            Value *src1 = EXTRACT_16(vSrc, 1);

            Value *indices0 = EXTRACT_16(vIndices, 0);
            Value *indices1 = EXTRACT_16(vIndices, 1);

            Value *mask0 = EXTRACT_16(vMask, 0);
            Value *mask1 = EXTRACT_16(vMask, 1);

            Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
            Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);

            vGather = JOIN_16(gather0, gather1);
        }

        return vGather;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        Value* vGather;

        // use avx2 gather instruction if available
        if (JM()->mArch.AVX2())
        {
            vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth / 2)), VectorType::get(mDoubleTy, mVWidth / 2));
            vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
            Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
            Value *vOffsets = MUL(vIndices, vScaleVec);
            for (uint32_t i = 0; i < mVWidth / 2; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets, C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase, offset);
                loadAddress = BITCAST(loadAddress, PointerType::get(mDoubleTy, 0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
                Value *selMask = VEXTRACT(vMask, C(i));
                // switch in a safe address to load if this lane is masked off
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress);
                vGather = VINSERT(vGather, val, C(i));
            }
            STACKRESTORE(pStack);
        }
        return vGather;
    }

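    //////////////////////////////////////////////////////////////////////////
    /// @brief Gathers four components of the given format, dispatching to the
    ///        float path for 32-bit float formats and the integer path otherwise
    /// @param format - source surface format
    /// @param pSrcBase - byte pointer to the first element to gather
    /// @param byteOffsets - SIMD wide byte offsets from pSrcBase
    /// @param mask - SIMD wide mask of valid lanes
    /// @param vGatherComponents - output array of four gathered components
    /// @param bPackedOutput - true if components should be left packed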
    void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
        Value* mask, Value* vGatherComponents[], bool bPackedOutput)
    {
        const SWR_FORMAT_INFO &info = GetFormatInfo(format);
        if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
        {
            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
        }
        else
        {
            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
        }
    }

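    //////////////////////////////////////////////////////////////////////////
    /// @brief Gathers up to four float components per lane for 16bpc or 32bpc
    ///        formats and shuffles them into per-component SIMD registers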
    void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
        Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
    {
        switch (info.bpp / info.numComps)
        {
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((float)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
            // e.g. result of first 8x32bit gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = GEP(pSrcBase, C((char)4));

                vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
                // e.g. result of second 8x32bit gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);

                // offset base to the next component to gather
                pSrcBase = GEP(pSrcBase, C((char)4));
            }
        }
        break;
        default:
            SWR_INVALID("Invalid float format");
            break;
        }
    }

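    //////////////////////////////////////////////////////////////////////////
    /// @brief Gathers up to four integer components per lane for 8bpc, 16bpc,
    ///        or 32bpc formats and shuffles them into per-component SIMD registers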
    void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
        Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
    {
        switch (info.bpp / info.numComps)
        {
        case 8:
        {
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
            Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
            // e.g. result of an 8x32bit integer gather for 8bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

            Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = GEP(pSrcBase, C((char)4));

                vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);

        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);

                // offset base to the next component to gather
                pSrcBase = GEP(pSrcBase, C((char)4));
            }
        }
        break;
        default:
            SWR_INVALID("unsupported format");
            break;
        }
    }

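    //////////////////////////////////////////////////////////////////////////
    /// @brief Shuffles the results of two 16bpc gathers into per-component SIMD
    ///        registers, either packed into 128-bit lanes or zero extended to
    ///        one component per 32-bit lane
    /// @param info - format info for the gathered surface
    /// @param vGatherInput - the two raw gather results (xy and zw interleaved)
    /// @param vGatherOutput - output array of four shuffled components
    /// @param bPackedOutput - true to leave components packed per 128-bit lane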
    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        // input could either be float or int vector; do shuffle work in int
        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

        if (bPackedOutput)
        {
            Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                          0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
                vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
            }

            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }

        }
        else
        {
            // pshufb masks for each component
            Value* vConstMask[2];
            // x/z shuffle mask
            vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                                      0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });

            // y/w shuffle mask
            vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                      2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });


            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // select correct constMask for x/z or y/w pshufb
                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use the first gather result, else use the second (zw) gather result
                uint32_t selectedGather = (i < 2) ? 0 : 1;

                vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
                // after pshufb mask for x channel; z uses the same shuffle from the second gather
                // 256i - 0    1    2    3    4    5    6    7
                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
            }
        }
    }

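    //////////////////////////////////////////////////////////////////////////
    /// @brief Shuffles the result of an 8bpc gather into per-component SIMD
    ///        registers, either packed into 128-bit lanes or zero extended to
    ///        one component per 32-bit lane
    /// @param info - format info for the gathered surface
    /// @param vGatherInput - raw gather result with xyzw interleaved per lane
    /// @param vGatherOutput - output array of four shuffled components
    /// @param bPackedOutput - true to leave components packed per 128-bit lane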
    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        if (bPackedOutput)
        {
            Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
            // shuffle mask
            Value* vConstMask = C<char>({ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                          0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 });
            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

            Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
            }

            // shuffle the enabled components into place, applying defaults for missing components
            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        // else zero extend
        else
        {
            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // pshufb masks for each component
                Value* vConstMask;
                switch (i)
                {
                case 0:
                    // x shuffle mask
                    vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                           0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
                    break;
                case 1:
                    // y shuffle mask
                    vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                           1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
                    break;
                case 2:
                    // z shuffle mask
                    vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                           2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
                    break;
                case 3:
                    // w shuffle mask
                    vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                           3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
                    break;
                default:
                    vConstMask = nullptr;
                    break;
                }

                vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                // after pshufb for x channel
                // 256i - 0    1    2    3    4    5    6    7
                //        x000 x000 x000 x000 x000 x000 x000 x000
            }
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief emulates a scatter operation.
    /// @param pDst - pointer to destination
    /// @param vSrc - vector of src data to scatter
    /// @param vOffsets - vector of byte offsets from pDst
    /// @param vMask - mask of valid lanes
    void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
    {
        /* Scatter algorithm

           while (Index = BitScanForward(mask))
               srcElem = srcVector[Index]
               offsetElem = offsetVector[Index]
               *(pDst + offsetElem) = srcElem
               Update mask: mask &= ~(1 << Index)
        */

        BasicBlock* pCurBB = IRB()->GetInsertBlock();
        Function* pFunc = pCurBB->getParent();
        Type* pSrcTy = vSrc->getType()->getVectorElementType();

        // Store vectors on stack
        if (pScatterStackSrc == nullptr)
        {
            // Save off stack allocations and reuse per scatter. Significantly reduces stack
            // requirements for shaders with a lot of scatters.
            pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
            pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
        }

        Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
        Value* pOffsetsArrayPtr = pScatterStackOffsets;
        STORE(vSrc, pSrcArrayPtr);
        STORE(vOffsets, pOffsetsArrayPtr);

        // Cast to pointers for random access
        pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
        pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

        Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));

        // Get cttz function
        Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });

        // Setup loop basic block
        BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);

        // compute first set bit
        Value* pIndex = CALL(pfnCttz, { pMask, C(false) });

        Value* pIsUndef = ICMP_EQ(pIndex, C(32));

        // Split current block
        BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

        // Remove unconditional jump created by splitBasicBlock
        pCurBB->getTerminator()->eraseFromParent();

        // Add terminator to end of original block
        IRB()->SetInsertPoint(pCurBB);

        // Add conditional branch
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Add loop basic block contents
        IRB()->SetInsertPoint(pLoop);
        PHINode* pIndexPhi = PHI(mInt32Ty, 2);
        PHINode* pMaskPhi = PHI(mInt32Ty, 2);

        pIndexPhi->addIncoming(pIndex, pCurBB);
        pMaskPhi->addIncoming(pMask, pCurBB);

        // Extract elements for this index
        Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });

        // GEP to this offset in dst
        Value* pCurDst = GEP(pDst, pOffsetElem);
        pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
        STORE(pSrcElem, pCurDst);

        // Update the mask
        Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

        // Terminator
        Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });

        pIsUndef = ICMP_EQ(pNewIndex, C(32));
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Update phi edges
        pIndexPhi->addIncoming(pNewIndex, pLoop);
        pMaskPhi->addIncoming(pNewMask, pLoop);

        // Move builder to beginning of post loop
        IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief save/restore stack, providing ability to push/pop the stack and
    ///        reduce overall stack requirements for temporary stack use
    Value* Builder::STACKSAVE()
    {
        Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
        return CALLA(pfnStackSave);
    }

    void Builder::STACKRESTORE(Value* pSaved)
    {
        Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
        CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
    }

}