/****************************************************************************
 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file builder_mem.cpp
 *
 * @brief Implementation for memory builder functions
 *
 * Notes:
 *
 ******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"

#include <cstdarg>

namespace SwrJit
{
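    //////////////////////////////////////////////////////////////////////////
    /// @brief Sanity check for the memory helpers below: a 64-bit integer
    ///        address indicates a GFX (GPU virtual) access, which must be
    ///        translated through BuilderGfxMem rather than this builder.
    ///        The usage parameter is currently unused here.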
    void Builder::AssertMemoryUsageParams(Value* ptr, MEM_CLIENT usage)
    {
        SWR_ASSERT(
            ptr->getType() != mInt64Ty,
            "Address appears to be GFX access. Requires translation through BuilderGfxMem.");
    }

    Value* Builder::GEP(Value* Ptr, Value* Idx, Type* Ty, bool isReadOnly, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, Idx, Name);
    }

    Value* Builder::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, Idx, Name);
    }

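    // Convenience overloads: these accept a brace-enclosed index list, e.g.
    // GEP(pBase, {0, 2}) (indices shown for illustration only); integer
    // constants are wrapped with C() and the call is forwarded to GEPA().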
    Value* Builder::GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return GEPA(ptr, indices);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return GEPA(ptr, indices);
    }

    Value* Builder::GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, IdxList, Name);
    }

    Value* Builder::GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, IdxList, Name);
    }

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return IN_BOUNDS_GEP(ptr, indices);
    }

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return IN_BOUNDS_GEP(ptr, indices);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const char* Name, Type* Ty, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const Twine& Name, Type* Ty, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Type* Ty, Value* Ptr, const Twine& Name, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ty, Ptr, Name);
    }

    LoadInst*
    Builder::LOAD(Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, isVolatile, Name);
    }

    LoadInst* Builder::LOAD(Value* basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine& name,
                            Type* Ty,
                            MEM_CLIENT usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return Builder::LOAD(GEPA(basePtr, valIndices), name);
    }

    LoadInst* Builder::LOADV(Value* basePtr,
                             const std::initializer_list<Value*>& indices,
                             const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    StoreInst*
    Builder::STORE(Value* val, Value* basePtr, const std::initializer_list<uint32_t>& indices, Type* Ty, MEM_CLIENT usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return STORE(val, GEPA(basePtr, valIndices));
    }

    StoreInst*
    Builder::STOREV(Value* val, Value* basePtr, const std::initializer_list<Value*>& indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return STORE(val, GEPA(basePtr, valIndices));
    }

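    //////////////////////////////////////////////////////////////////////////
    /// @brief Advance a base pointer to the next component to gather. The
    ///        Gather4 paths below pass an i8* base, so the GEP amounts to a
    ///        byte offset (4 bytes per 32-bit component).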
    Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset)
    {
        return GEP(base, offset);
    }

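    //////////////////////////////////////////////////////////////////////////
    /// @brief Read-modify-write helper: loads the i32 at basePtr[indices],
    ///        adds i32Incr, and stores the result back to the same location.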
    Value* Builder::MEM_ADD(Value* i32Incr,
                            Value* basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine& name)
    {
        Value* i32Value = LOAD(GEP(basePtr, indices), name);
        Value* i32Result = ADD(i32Value, i32Incr);
        return STORE(i32Result, GEP(basePtr, indices));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
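    ///
    /// Conceptual per-lane behavior: for each lane enabled in vMask, load a
    /// 32-bit float from pBase + vIndices[lane] * scale; disabled lanes keep
    /// the corresponding element of vSrc.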
    Value* Builder::GATHERPS(Value* vSrc,
                             Value* pBase,
                             Value* vIndices,
                             Value* vMask,
                             uint8_t scale,
                             MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERDD(Value* vSrc,
                             Value* pBase,
                             Value* vIndices,
                             Value* vMask,
                             uint8_t scale,
                             MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value*
    Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Alternative masked gather where source is a vector of pointers
    /// @param pVecSrcPtr - SIMD wide vector of pointers
    /// @param pVecMask - SIMD active lanes
    /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive
    Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
    {
        return MASKED_GATHER(pVecSrcPtr, AlignType(4), pVecMask, pVecPassthru);
    }

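    //////////////////////////////////////////////////////////////////////////
    /// @brief Alternative masked scatter where destination is a vector of pointers
    /// @param pVecDstPtr - SIMD wide vector of destination pointers
    /// @param pVecSrc - SIMD wide vector of values to store
    /// @param pVecMask - SIMD active lanes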
    void Builder::SCATTER_PTR(Value* pVecDstPtr, Value* pVecSrc, Value* pVecMask)
    {
        MASKED_SCATTER(pVecSrc, pVecDstPtr, AlignType(4), pVecMask);
    }

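    //////////////////////////////////////////////////////////////////////////
    /// @brief Gather all components of a texel/vertex in the given format,
    ///        dispatching to the float path (GATHER4PS) for 32-bit float
    ///        formats and to the integer path (GATHER4DD) otherwise.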
    void Builder::Gather4(const SWR_FORMAT format,
                          Value* pSrcBase,
                          Value* byteOffsets,
                          Value* mask,
                          Value* vGatherComponents[],
                          bool bPackedOutput,
                          MEM_CLIENT usage)
    {
        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
        if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
        {
            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
        else
        {
            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
    }

    void Builder::GATHER4PS(const SWR_FORMAT_INFO& info,
                            Value* pSrcBase,
                            Value* byteOffsets,
                            Value* vMask,
                            Value* vGatherComponents[],
                            bool bPackedOutput,
                            MEM_CLIENT usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((float)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERPS(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("Invalid float format");
            break;
        }
    }

    void Builder::GATHER4DD(const SWR_FORMAT_INFO& info,
                            Value* pSrcBase,
                            Value* byteOffsets,
                            Value* vMask,
                            Value* vGatherComponents[],
                            bool bPackedOutput,
                            MEM_CLIENT usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 8:
        {
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
            Value* vGatherResult =
                GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of an 8x32bit integer gather for 8bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

            Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERDD(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("unsupported format");
            break;
        }
    }

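    //////////////////////////////////////////////////////////////////////////
    /// @brief Shuffle the results of the two 16bpc gathers into per-component
    ///        SIMD registers, one row per component. With bPackedOutput set,
    ///        the components are packed into 128-bit lanes instead.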
    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO& info,
                                      Value* vGatherInput[2],
                                      Value* vGatherOutput[4],
                                      bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        // input could either be float or int vector; do shuffle work in int
        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

        if (bPackedOutput)
        {
            Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
                                              mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                Value* vShufResult =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            }

            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        else
        {
            // pshufb masks for each component
            Value* vConstMask[2];
            // x/z shuffle mask
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });

            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});

            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // select correct constMask for x/z or y/w pshufb
                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use the first (xy) gather result, else use the second (zw) gather result
                uint32_t selectedGather = (i < 2) ? 0 : 1;

                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty),
                                   vConstMask[selectedMask]),
                            vGatherTy);
                // after pshufb mask for x channel; z uses the same shuffle from the second gather
                // 256i - 0    1    2    3    4    5    6    7
                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
            }
        }
    }

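    //////////////////////////////////////////////////////////////////////////
    /// @brief Shuffle the result of an 8bpc gather into per-component SIMD
    ///        registers. With bPackedOutput set, components are packed into
    ///        128-bit lanes; otherwise each component is zero-extended into
    ///        the low byte of its 32-bit lane.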
    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO& info,
                                     Value* vGatherInput,
                                     Value* vGatherOutput[],
                                     bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        if (bPackedOutput)
        {
            Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
                                           mVWidth / 4); // vwidth is units of 32 bits
            // shuffle mask
            Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
            }

            // select the enabled components, applying the format defaults for
            // any components that were not fetched
            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        // else zero extend
        else
        {
            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // pshufb masks for each component
                Value* vConstMask;
                switch (i)
                {
                case 0:
                    // x shuffle mask
                    vConstMask =
                        C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                    break;
                case 1:
                    // y shuffle mask
                    vConstMask =
                        C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                    break;
                case 2:
                    // z shuffle mask
                    vConstMask =
                        C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                    break;
                case 3:
                    // w shuffle mask
                    vConstMask =
                        C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                    break;
                default:
                    vConstMask = nullptr;
                    break;
                }

                assert(vConstMask && "Invalid info.numComps value");
                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                // after pshufb for x channel
                // 256i - 0    1    2    3    4    5    6    7
                //        x000 x000 x000 x000 x000 x000 x000 x000
            }
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief emulates a scatter operation.
    /// @param pDst - pointer to destination
    /// @param vSrc - vector of src data to scatter
    /// @param vOffsets - vector of byte offsets from pDst
    /// @param vMask - mask of valid lanes
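    ///
    /// Per-lane behavior: for each lane enabled in vMask, the corresponding
    /// element of vSrc is stored to pDst + vOffsets[lane] (see the reference
    /// loop kept in comments at the end of this function).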
    void Builder::SCATTERPS(
        Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pDst, usage);
#if LLVM_VERSION_MAJOR >= 11
        SWR_ASSERT(cast<VectorType>(vSrc->getType())->getElementType()->isFloatTy());
#else
        SWR_ASSERT(vSrc->getType()->getVectorElementType()->isFloatTy());
#endif
        VSCATTERPS(pDst, vMask, vOffsets, vSrc, C(1));
        return;

        /* Scatter algorithm

           while (Index = BitScanForward(mask))
               srcElem = srcVector[Index]
               offsetElem = offsetVector[Index]
               *(pDst + offsetElem) = srcElem
               Update mask (mask &= ~(1 << Index))

        */

        /*

        // Reference implementation kept around for reference

        BasicBlock* pCurBB = IRB()->GetInsertBlock();
        Function* pFunc = pCurBB->getParent();
        Type* pSrcTy = vSrc->getType()->getVectorElementType();

        // Store vectors on stack
        if (pScatterStackSrc == nullptr)
        {
            // Save off stack allocations and reuse per scatter. Significantly reduces stack
            // requirements for shaders with a lot of scatters.
            pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
            pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
        }

        Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
        Value* pOffsetsArrayPtr = pScatterStackOffsets;
        STORE(vSrc, pSrcArrayPtr);
        STORE(vOffsets, pOffsetsArrayPtr);

        // Cast to pointers for random access
        pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
        pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

        Value* pMask = VMOVMSK(vMask);

        // Setup loop basic block
        BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);

        // compute first set bit
        Value* pIndex = CTTZ(pMask, C(false));

        Value* pIsUndef = ICMP_EQ(pIndex, C(32));

        // Split current block or create new one if building inline
        BasicBlock* pPostLoop;
        if (pCurBB->getTerminator())
        {
            pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

            // Remove unconditional jump created by splitBasicBlock
            pCurBB->getTerminator()->eraseFromParent();

            // Add terminator to end of original block
            IRB()->SetInsertPoint(pCurBB);

            // Add conditional branch
            COND_BR(pIsUndef, pPostLoop, pLoop);
        }
        else
        {
            pPostLoop = BasicBlock::Create(mpJitMgr->mContext, "PostScatter_Loop", pFunc);

            // Add conditional branch
            COND_BR(pIsUndef, pPostLoop, pLoop);
        }

        // Add loop basic block contents
        IRB()->SetInsertPoint(pLoop);
        PHINode* pIndexPhi = PHI(mInt32Ty, 2);
        PHINode* pMaskPhi = PHI(mInt32Ty, 2);

        pIndexPhi->addIncoming(pIndex, pCurBB);
        pMaskPhi->addIncoming(pMask, pCurBB);

        // Extract elements for this index
        Value* pSrcElem = LOADV(pSrcArrayPtr, {pIndexPhi});
        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, {pIndexPhi});

        // GEP to this offset in dst
        Value* pCurDst = GEP(pDst, pOffsetElem, mInt8PtrTy);
        pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
        STORE(pSrcElem, pCurDst);

        // Update the mask
        Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

        // Terminator
        Value* pNewIndex = CTTZ(pNewMask, C(false));

        pIsUndef = ICMP_EQ(pNewIndex, C(32));
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Update phi edges
        pIndexPhi->addIncoming(pNewIndex, pLoop);
        pMaskPhi->addIncoming(pNewMask, pLoop);

        // Move builder to beginning of post loop
        IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());

        */
    }
} // namespace SwrJit