swr/rast: Fix addPassesToEmitFile usage with llvm-7.0.
src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder_mem.cpp
*
* @brief Implementation for the builder memory functions (loads, stores,
*        gathers, and scatters).
*
* Notes:
*
******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"
#include "common/rdtsc_buckets.h"

#include <cstdarg>

namespace SwrJit
{
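    //////////////////////////////////////////////////////////////////////////
    /// @brief Sanity-check the pointer handed to a memory helper. A raw i64
    ///        address is treated as a GFX (GPU virtual) address, which this
    ///        CPU-side builder cannot dereference directly; such accesses are
    ///        expected to go through BuilderGfxMem instead.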
    void Builder::AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage)
    {
        SWR_ASSERT(
            ptr->getType() != mInt64Ty,
            "Address appears to be GFX access. Requires translation through BuilderGfxMem.");
    }

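    //////////////////////////////////////////////////////////////////////////
    /// @brief Thin wrappers around IRBuilder::CreateGEP. Note that the Ty
    ///        argument is ignored by the untyped overloads below; it is likely
    ///        kept in the signature so derived builders (e.g. BuilderGfxMem)
    ///        can override these entry points with typed/translated address
    ///        arithmetic.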
    Value* Builder::GEP(Value* Ptr, Value* Idx, Type* Ty, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, Idx, Name);
    }

    Value* Builder::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, Idx, Name);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return GEPA(ptr, indices);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return GEPA(ptr, indices);
    }

    Value* Builder::GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, IdxList, Name);
    }

    Value* Builder::GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, IdxList, Name);
    }

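    //////////////////////////////////////////////////////////////////////////
    /// @brief GEP variants that emit "inbounds" address arithmetic; the
    ///        initializer-list overloads expand the indices and forward to the
    ///        ArrayRef-based IN_BOUNDS_GEP.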
    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return IN_BOUNDS_GEP(ptr, indices);
    }

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return IN_BOUNDS_GEP(ptr, indices);
    }

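    //////////////////////////////////////////////////////////////////////////
    /// @brief LOAD wrappers. Each overload first validates that the address is
    ///        a legal CPU-side pointer (see AssertMemoryUsageParams) and then
    ///        delegates to IRBuilder::CreateLoad; the index-list overloads GEP
    ///        to the requested member before loading. The Ty/usage parameters
    ///        are presumably consumed by overriding builders such as
    ///        BuilderGfxMem.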
    LoadInst* Builder::LOAD(Value* Ptr, const char* Name, Type* Ty, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Type* Ty, Value* Ptr, const Twine& Name, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ty, Ptr, Name);
    }

    LoadInst*
    Builder::LOAD(Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, isVolatile, Name);
    }

    LoadInst* Builder::LOAD(Value* basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine& name,
                            Type* Ty,
                            JIT_MEM_CLIENT usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return Builder::LOAD(GEPA(basePtr, valIndices), name);
    }

    LoadInst* Builder::LOADV(Value* basePtr,
                             const std::initializer_list<Value*>& indices,
                             const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return LOAD(GEPA(basePtr, valIndices), name);
    }

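    //////////////////////////////////////////////////////////////////////////
    /// @brief STORE/STOREV helpers: compute the member address with GEPA from
    ///        the given constant (STORE) or dynamic (STOREV) indices, then
    ///        store the value to that location.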
    StoreInst*
    Builder::STORE(Value* val, Value* basePtr, const std::initializer_list<uint32_t>& indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return STORE(val, GEPA(basePtr, valIndices));
    }

    StoreInst*
    Builder::STOREV(Value* val, Value* basePtr, const std::initializer_list<Value*>& indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return STORE(val, GEPA(basePtr, valIndices));
    }

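    //////////////////////////////////////////////////////////////////////////
    /// @brief Advance a byte-addressed base pointer by a constant offset to
    ///        reach the next component to fetch; implemented as a plain GEP on
    ///        the base address.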
    Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset)
    {
        return GEP(base, offset);
    }

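    //////////////////////////////////////////////////////////////////////////
    /// @brief Read-modify-write helper: load the i32 member addressed by
    ///        basePtr/indices, add i32Incr to it, and store the sum back to
    ///        the same location.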
    Value* Builder::MEM_ADD(Value* i32Incr,
                            Value* basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine& name)
    {
        Value* i32Value  = LOAD(GEP(basePtr, indices), name);
        Value* i32Result = ADD(i32Value, i32Incr);
        return STORE(i32Result, GEP(basePtr, indices));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERPS(Value* vSrc,
                             Value* pBase,
                             Value* vIndices,
                             Value* vMask,
                             uint8_t scale,
                             JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERDD(Value* vSrc,
                             Value* pBase,
                             Value* vIndices,
                             Value* vMask,
                             uint8_t scale,
                             JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value*
    Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Alternative masked gather where source is a vector of pointers
    /// @param pVecSrcPtr - SIMD wide vector of pointers
    /// @param pVecMask - SIMD active lanes
    /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive
    Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
    {
        return MASKED_GATHER(pVecSrcPtr, 4, pVecMask, pVecPassthru);
    }

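    //////////////////////////////////////////////////////////////////////////
    /// @brief Gather the four components of an element in the given
    ///        SWR_FORMAT: 32-bit float formats go through the GATHERPS path,
    ///        all other formats through the integer GATHERDD path.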
    void Builder::Gather4(const SWR_FORMAT format,
                          Value* pSrcBase,
                          Value* byteOffsets,
                          Value* mask,
                          Value* vGatherComponents[],
                          bool bPackedOutput,
                          JIT_MEM_CLIENT usage)
    {
        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
        if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
        {
            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
        else
        {
            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
    }

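    //////////////////////////////////////////////////////////////////////////
    /// @brief Gather4 for float formats. The strategy is selected by bits per
    ///        component: 16bpc formats issue two dword gathers (xy, then zw)
    ///        followed by a shuffle pass, while 32bpc formats gather each
    ///        enabled component directly into its swizzled output slot.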
    void Builder::GATHER4PS(const SWR_FORMAT_INFO& info,
                            Value* pSrcBase,
                            Value* byteOffsets,
                            Value* vMask,
                            Value* vGatherComponents[],
                            bool bPackedOutput,
                            JIT_MEM_CLIENT usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((float)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components (zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERPS(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("Invalid float format");
            break;
        }
    }

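    //////////////////////////////////////////////////////////////////////////
    /// @brief Gather4 for integer formats. 8bpc formats need a single dword
    ///        gather plus a byte shuffle, 16bpc formats need two dword gathers
    ///        (xy, then zw) plus a word shuffle, and 32bpc formats gather each
    ///        enabled component directly into its swizzled output slot.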
    void Builder::GATHER4DD(const SWR_FORMAT_INFO& info,
                            Value* pSrcBase,
                            Value* byteOffsets,
                            Value* vMask,
                            Value* vGatherComponents[],
                            bool bPackedOutput,
                            JIT_MEM_CLIENT usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 8:
        {
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
            Value* vGatherResult =
                GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of an 8x32bit integer gather for 8bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

            Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components (zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERDD(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("unsupported format");
            break;
        }
    }

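    //////////////////////////////////////////////////////////////////////////
    /// @brief Repack two interleaved 16bpc gather results (xyxy... / zwzw...)
    ///        into per-component SIMD registers. With bPackedOutput the
    ///        components remain packed as 16-bit words within 128-bit lanes;
    ///        otherwise each component is zero-extended into its own 32-bit
    ///        lane. Missing components are filled from the format defaults.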
    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO& info,
                                      Value* vGatherInput[2],
                                      Value* vGatherOutput[4],
                                      bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        // input could either be float or int vector; do shuffle work in int
        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

        if (bPackedOutput)
        {
            Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
                                              mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                Value* vShufResult =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            }

            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        else
        {
            // pshufb masks for each component
            Value* vConstMask[2];
            // x/z shuffle mask
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });

            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});

            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // select correct constMask for x/z or y/w pshufb
                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use the first gather result, else use the second (zw) gather
                uint32_t selectedGather = (i < 2) ? 0 : 1;

                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty),
                                   vConstMask[selectedMask]),
                            vGatherTy);
                // after pshufb mask for x channel; z uses the same shuffle from the second gather
                // 256i - 0    1    2    3    4    5    6    7
                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
            }
        }
    }

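    //////////////////////////////////////////////////////////////////////////
    /// @brief Repack a single 8bpc gather result (xyzw bytes in each 32-bit
    ///        lane) into per-component SIMD registers. With bPackedOutput the
    ///        components remain packed as bytes within 128-bit lanes;
    ///        otherwise each component byte is zero-extended into its own
    ///        32-bit lane. Missing components are filled from the format
    ///        defaults.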
    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO& info,
                                     Value* vGatherInput,
                                     Value* vGatherOutput[],
                                     bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        if (bPackedOutput)
        {
            Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
                                           mVWidth / 4); // vwidth is units of 32 bits
            // shuffle mask
            Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
            }

            // sign extend all enabled components. If we have a full vVertexElements, output to
            // the current simdvertex
            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // sign extend
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        // else zero extend
        else
        {
            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // pshufb masks for each component
                Value* vConstMask;
                switch (i)
                {
                case 0:
                    // x shuffle mask
                    vConstMask =
                        C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                    break;
                case 1:
                    // y shuffle mask
                    vConstMask =
                        C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                    break;
                case 2:
                    // z shuffle mask
                    vConstMask =
                        C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                    break;
                case 3:
                    // w shuffle mask
                    vConstMask =
                        C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                    break;
                default:
                    vConstMask = nullptr;
                    break;
                }

                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                // after pshufb for x channel
                // 256i - 0    1    2    3    4    5    6    7
                //        x000 x000 x000 x000 x000 x000 x000 x000
            }
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief emulates a scatter operation.
    /// @param pDst - pointer to destination
    /// @param vSrc - vector of src data to scatter
    /// @param vOffsets - vector of byte offsets from pDst
    /// @param vMask - mask of valid lanes
    void Builder::SCATTERPS(
        Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pDst, usage);

        /* Scatter algorithm

           while(Index = BitScanForward(mask))
               srcElem = srcVector[Index]
               offsetElem = offsetVector[Index]
               *(pDst + offsetElem) = srcElem
               Update mask (mask &= ~(1 << Index))

        */

        BasicBlock* pCurBB = IRB()->GetInsertBlock();
        Function*   pFunc  = pCurBB->getParent();
        Type*       pSrcTy = vSrc->getType()->getVectorElementType();

        // Store vectors on stack
        if (pScatterStackSrc == nullptr)
        {
            // Save off stack allocations and reuse per scatter. Significantly reduces stack
            // requirements for shaders with a lot of scatters.
            pScatterStackSrc     = CreateEntryAlloca(pFunc, mSimdInt64Ty);
            pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
        }

        Value* pSrcArrayPtr     = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
        Value* pOffsetsArrayPtr = pScatterStackOffsets;
        STORE(vSrc, pSrcArrayPtr);
        STORE(vOffsets, pOffsetsArrayPtr);

        // Cast to pointers for random access
        pSrcArrayPtr     = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
        pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

        Value* pMask = VMOVMSK(vMask);

        // Setup loop basic block
        BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);

        // compute first set bit
        Value* pIndex = CTTZ(pMask, C(false));

        Value* pIsUndef = ICMP_EQ(pIndex, C(32));

        // Split current block or create new one if building inline
        BasicBlock* pPostLoop;
        if (pCurBB->getTerminator())
        {
            pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

            // Remove unconditional jump created by splitBasicBlock
            pCurBB->getTerminator()->eraseFromParent();

            // Add terminator to end of original block
            IRB()->SetInsertPoint(pCurBB);

            // Add conditional branch
            COND_BR(pIsUndef, pPostLoop, pLoop);
        }
        else
        {
            pPostLoop = BasicBlock::Create(mpJitMgr->mContext, "PostScatter_Loop", pFunc);

            // Add conditional branch
            COND_BR(pIsUndef, pPostLoop, pLoop);
        }

        // Add loop basic block contents
        IRB()->SetInsertPoint(pLoop);
        PHINode* pIndexPhi = PHI(mInt32Ty, 2);
        PHINode* pMaskPhi  = PHI(mInt32Ty, 2);

        pIndexPhi->addIncoming(pIndex, pCurBB);
        pMaskPhi->addIncoming(pMask, pCurBB);

        // Extract elements for this index
        Value* pSrcElem    = LOADV(pSrcArrayPtr, {pIndexPhi});
        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, {pIndexPhi});

        // GEP to this offset in dst
        Value* pCurDst = GEP(pDst, pOffsetElem, mInt8PtrTy);
        pCurDst        = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
        STORE(pSrcElem, pCurDst);

        // Update the mask
        Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

        // Terminator
        Value* pNewIndex = CTTZ(pNewMask, C(false));

        pIsUndef = ICMP_EQ(pNewIndex, C(32));
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Update phi edges
        pIndexPhi->addIncoming(pNewIndex, pLoop);
        pMaskPhi->addIncoming(pNewMask, pLoop);

        // Move builder to beginning of post loop
        IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
    }
} // namespace SwrJit