swr/rast: Fix 64bit float loads in x86 lowering pass
[mesa.git] / src / gallium / drivers / swr / rasterizer / jitter / builder_mem.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_mem.cpp
24 *
25 * @brief Implementation for the memory builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder.h"
32 #include "common/rdtsc_buckets.h"
33
34 #include <cstdarg>
35
36
37 namespace SwrJit
38 {
39 void Builder::AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage)
40 {
41 SWR_ASSERT(ptr->getType() != mInt64Ty, "Address appears to be GFX access. Requires translation through BuilderGfxMem.");
42 }
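    // Note on the assert above: GFX virtual addresses are carried as plain i64
    // values rather than LLVM pointers, so a 64-bit integer "pointer" reaching this
    // builder needs address translation first.  Illustrative sketch (the value
    // names are hypothetical):
    //
    //     Value* pHostPtr  = ...;  // typed pointer -> handled directly by Builder
    //     Value* xpGfxAddr = ...;  // i64 GFX address -> trips the assert here; such
    //                              // accesses go through BuilderGfxMem, which
    //                              // translates the address before the load/store.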
43
44 Value *Builder::GEP(Value *Ptr, Value *Idx, Type *Ty, const Twine &Name)
45 {
46 return IRB()->CreateGEP(Ptr, Idx, Name);
47 }
48
49 Value *Builder::GEP(Type *Ty, Value *Ptr, Value *Idx, const Twine &Name)
50 {
51 return IRB()->CreateGEP(Ty, Ptr, Idx, Name);
52 }
53
54 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList, Type *Ty)
55 {
56 std::vector<Value*> indices;
57 for (auto i : indexList)
58 indices.push_back(i);
59 return GEPA(ptr, indices);
60 }
61
62 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList, Type *Ty)
63 {
64 std::vector<Value*> indices;
65 for (auto i : indexList)
66 indices.push_back(C(i));
67 return GEPA(ptr, indices);
68 }
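    // Usage sketch for the initializer-list GEP overloads above (the pointer and the
    // member index are hypothetical, chosen only to show the constant-index form):
    //
    //     Value* pMember = GEP(pState, { 0, 2 });          // &pState-><member #2>
    //
    // which is shorthand for GEPA(pState, { C(0), C(2) }) with constant integer indices.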
69
70 Value *Builder::GEPA(Value *Ptr, ArrayRef<Value *> IdxList, const Twine &Name)
71 {
72 return IRB()->CreateGEP(Ptr, IdxList, Name);
73 }
74
75 Value *Builder::GEPA(Type *Ty, Value *Ptr, ArrayRef<Value *> IdxList, const Twine &Name)
76 {
77 return IRB()->CreateGEP(Ty, Ptr, IdxList, Name);
78 }
79
80 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
81 {
82 std::vector<Value*> indices;
83 for (auto i : indexList)
84 indices.push_back(i);
85 return IN_BOUNDS_GEP(ptr, indices);
86 }
87
88 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
89 {
90 std::vector<Value*> indices;
91 for (auto i : indexList)
92 indices.push_back(C(i));
93 return IN_BOUNDS_GEP(ptr, indices);
94 }
95
96 LoadInst* Builder::LOAD(Value *Ptr, const char *Name, Type *Ty, JIT_MEM_CLIENT usage)
97 {
98 AssertMemoryUsageParams(Ptr, usage);
99 return IRB()->CreateLoad(Ptr, Name);
100 }
101
102 LoadInst* Builder::LOAD(Value *Ptr, const Twine &Name, Type *Ty, JIT_MEM_CLIENT usage)
103 {
104 AssertMemoryUsageParams(Ptr, usage);
105 return IRB()->CreateLoad(Ptr, Name);
106 }
107
108 LoadInst* Builder::LOAD(Type *Ty, Value *Ptr, const Twine &Name, JIT_MEM_CLIENT usage)
109 {
110 AssertMemoryUsageParams(Ptr, usage);
111 return IRB()->CreateLoad(Ty, Ptr, Name);
112 }
113
114 LoadInst* Builder::LOAD(Value *Ptr, bool isVolatile, const Twine &Name, Type *Ty, JIT_MEM_CLIENT usage)
115 {
116 AssertMemoryUsageParams(Ptr, usage);
117 return IRB()->CreateLoad(Ptr, isVolatile, Name);
118 }
119
120 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name, Type *Ty, JIT_MEM_CLIENT usage)
121 {
122 std::vector<Value*> valIndices;
123 for (auto i : indices)
124 valIndices.push_back(C(i));
125 return Builder::LOAD(GEPA(basePtr, valIndices), name);
126 }
127
128 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
129 {
130 std::vector<Value*> valIndices;
131 for (auto i : indices)
132 valIndices.push_back(i);
133 return LOAD(GEPA(basePtr, valIndices), name);
134 }
135
136 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
137 {
138 std::vector<Value*> valIndices;
139 for (auto i : indices)
140 valIndices.push_back(C(i));
141 return STORE(val, GEPA(basePtr, valIndices));
142 }
143
144 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
145 {
146 std::vector<Value*> valIndices;
147 for (auto i : indices)
148 valIndices.push_back(i);
149 return STORE(val, GEPA(basePtr, valIndices));
150 }
151
152 Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant *offset)
153 {
154 return GEP(base, offset);
155 }
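    // OFFSET_TO_NEXT_COMPONENT is just a GEP on the base pointer: with the i8*
    // vertex-buffer bases used by the Gather4 paths below, the offset is in bytes,
    // so stepping by C((intptr_t)4) advances to the next 32-bit component:
    //
    //     pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4)); // base += 4 bytes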
156
157 Value* Builder::MEM_ADD(Value* i32Incr, Value* basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
158 {
159 Value* i32Value = LOAD(GEP(basePtr, indices), name);
160 Value* i32Result = ADD(i32Value, i32Incr);
161 return STORE(i32Result, GEP(basePtr, indices));
162 }
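    // MEM_ADD is a plain read-modify-write helper: load the addressed dword, add the
    // increment, store the sum back.  Usage sketch (the stats pointer and the field
    // indices are hypothetical):
    //
    //     MEM_ADD(C(1), pStats, { 0, 3 }, "IncCounter");   // pStats' field #3 += 1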
163
164 //////////////////////////////////////////////////////////////////////////
165 /// @brief Generate a masked gather operation in LLVM IR. If not
166 /// supported on the underlying platform, emulate it with loads
167         /// @param vSrc - SIMD wide value used for lanes whose mask bit is inactive
168 /// @param pBase - Int8* base VB address pointer value
169 /// @param vIndices - SIMD wide value of VB byte offsets
170 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
171 /// @param scale - value to scale indices by
172 Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale, JIT_MEM_CLIENT usage)
173 {
174 AssertMemoryUsageParams(pBase, usage);
175
176 return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
177 }
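    // Per-lane behavior of the masked gather above, written as a scalar reference
    // (assumes an 8-wide SIMD; maskActive() is a stand-in for testing a lane's mask
    // bit).  GATHERDD and GATHERPD below behave the same way for 32-bit integer and
    // 64-bit double elements respectively:
    //
    //     float result[8];
    //     for (int lane = 0; lane < 8; ++lane)
    //     {
    //         if (maskActive(vMask, lane))
    //             result[lane] = *(const float*)(pBase + vIndices[lane] * scale);
    //         else
    //             result[lane] = vSrc[lane];   // pass-through value for inactive lanes
    //     }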
178
179 //////////////////////////////////////////////////////////////////////////
180 /// @brief Generate a masked gather operation in LLVM IR. If not
181 /// supported on the underlying platform, emulate it with loads
182         /// @param vSrc - SIMD wide value used for lanes whose mask bit is inactive
183 /// @param pBase - Int8* base VB address pointer value
184 /// @param vIndices - SIMD wide value of VB byte offsets
185 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
186 /// @param scale - value to scale indices by
187 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale, JIT_MEM_CLIENT usage)
188 {
189 AssertMemoryUsageParams(pBase, usage);
190
191 return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
192 }
193
194 //////////////////////////////////////////////////////////////////////////
195 /// @brief Generate a masked gather operation in LLVM IR. If not
196 /// supported on the underlying platform, emulate it with loads
197         /// @param vSrc - SIMD wide value used for lanes whose mask bit is inactive
198 /// @param pBase - Int8* base VB address pointer value
199 /// @param vIndices - SIMD wide value of VB byte offsets
200 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
201 /// @param scale - value to scale indices by
202 Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
203 {
204 return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
205 }
206
207 //////////////////////////////////////////////////////////////////////////
208 /// @brief Alternative masked gather where source is a vector of pointers
209 /// @param pVecSrcPtr - SIMD wide vector of pointers
210 /// @param pVecMask - SIMD active lanes
211 /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive
212 Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
213 {
214 return MASKED_GATHER(pVecSrcPtr, 4, pVecMask, pVecPassthru);
215 }
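    // Unlike the base + byte-offset gathers above, GATHER_PTR carries a full pointer
    // in every lane (the literal 4 being the alignment passed through to
    // MASKED_GATHER).  Scalar sketch of the per-lane behavior (8-wide assumed,
    // maskActive() as above):
    //
    //     for (int lane = 0; lane < 8; ++lane)
    //         result[lane] = maskActive(pVecMask, lane) ? *pVecSrcPtr[lane]
    //                                                   : pVecPassthru[lane];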
216
217 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
218 Value* mask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage)
219 {
220 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
221 if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
222 {
223 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
224 }
225 else
226 {
227 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
228 }
229 }
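    // The dispatch above keys only off the first channel of the format: 32-bit float
    // data takes the GATHER4PS path, everything else (8/16/32-bit integer or
    // normalized data) is gathered as dwords and shuffled afterwards.  Caller sketch
    // (format names are examples; the other arguments follow the signature above):
    //
    //     Value* vComponents[4];
    //     Gather4(R32G32B32A32_FLOAT, pAttribBase, vByteOffsets, vMask,
    //             vComponents, false /*bPackedOutput*/, usage);   // -> GATHER4PS
    //     Gather4(R8G8B8A8_UNORM, pAttribBase, vByteOffsets, vMask,
    //             vComponents, false /*bPackedOutput*/, usage);   // -> GATHER4DD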
230
231 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
232 Value* vMask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage)
233 {
234 switch (info.bpp / info.numComps)
235 {
236 case 16:
237 {
238 Value* vGatherResult[2];
239
240 // TODO: vGatherMaskedVal
241 Value* vGatherMaskedVal = VIMMED1((float)0);
242
243 // always have at least one component out of x or y to fetch
244
245 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
246 // e.g. result of first 8x32bit integer gather for 16bit components
247 // 256i - 0 1 2 3 4 5 6 7
248 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
249 //
250
251                 // if we have at least one component out of z or w to fetch
252 if (info.numComps > 2)
253 {
254 // offset base to the next components(zw) in the vertex to gather
255 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
256
257 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
258 // e.g. result of second 8x32bit integer gather for 16bit components
259 // 256i - 0 1 2 3 4 5 6 7
260 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
261 //
262 }
263 else
264 {
265 vGatherResult[1] = vGatherMaskedVal;
266 }
267
268 // Shuffle gathered components into place, each row is a component
269 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
270 }
271 break;
272 case 32:
273 {
274 // apply defaults
275 for (uint32_t i = 0; i < 4; ++i)
276 {
277 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
278 }
279
280 for (uint32_t i = 0; i < info.numComps; i++)
281 {
282 uint32_t swizzleIndex = info.swizzle[i];
283
284 // Gather a SIMD of components
285 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);
286
287 // offset base to the next component to gather
288 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
289 }
290 }
291 break;
292 default:
293 SWR_INVALID("Invalid float format");
294 break;
295 }
296 }
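    // Scalar reference for the 32bpc loop above (8-wide assumed): each present
    // component is gathered into its swizzled output slot, while components the
    // format does not provide keep the splatted defaults set before the loop:
    //
    //     for (uint32_t c = 0; c < info.numComps; ++c)
    //         for (int lane = 0; lane < 8; ++lane)
    //             if (maskActive(vMask, lane))
    //                 out[info.swizzle[c]][lane] =
    //                     *(const float*)(pSrcBase + byteOffsets[lane] + 4 * c);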
297
298 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
299 Value* vMask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage)
300 {
301 switch (info.bpp / info.numComps)
302 {
303 case 8:
304 {
305 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
306 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
307 // e.g. result of an 8x32bit integer gather for 8bit components
308 // 256i - 0 1 2 3 4 5 6 7
309 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
310
311 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
312 }
313 break;
314 case 16:
315 {
316 Value* vGatherResult[2];
317
318 // TODO: vGatherMaskedVal
319 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
320
321 // always have at least one component out of x or y to fetch
322
323 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
324 // e.g. result of first 8x32bit integer gather for 16bit components
325 // 256i - 0 1 2 3 4 5 6 7
326 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
327 //
328
329                 // if we have at least one component out of z or w to fetch
330 if (info.numComps > 2)
331 {
332 // offset base to the next components(zw) in the vertex to gather
333 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
334
335 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
336 // e.g. result of second 8x32bit integer gather for 16bit components
337 // 256i - 0 1 2 3 4 5 6 7
338 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
339 //
340 }
341 else
342 {
343 vGatherResult[1] = vGatherMaskedVal;
344 }
345
346 // Shuffle gathered components into place, each row is a component
347 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
348
349 }
350 break;
351 case 32:
352 {
353 // apply defaults
354 for (uint32_t i = 0; i < 4; ++i)
355 {
356 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
357 }
358
359 for (uint32_t i = 0; i < info.numComps; i++)
360 {
361 uint32_t swizzleIndex = info.swizzle[i];
362
363 // Gather a SIMD of components
364 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);
365
366 // offset base to the next component to gather
367 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
368 }
369 }
370 break;
371 default:
372 SWR_INVALID("unsupported format");
373 break;
374 }
375 }
376
377 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
378 {
379 // cast types
380 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
381 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
382
383 // input could either be float or int vector; do shuffle work in int
384 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
385 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
386
387 if (bPackedOutput)
388 {
389 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
390
391 // shuffle mask
392 Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
393 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
394 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
395 // after pshufb: group components together in each 128bit lane
396 // 256i - 0 1 2 3 4 5 6 7
397 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
398
399 Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
400 // after PERMD: move and pack xy components into each 128bit lane
401 // 256i - 0 1 2 3 4 5 6 7
402 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
403
404 // do the same for zw components
405 Value* vi128ZW = nullptr;
406 if (info.numComps > 2)
407 {
408 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
409 vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
410 }
411
412 for (uint32_t i = 0; i < 4; i++)
413 {
414 uint32_t swizzleIndex = info.swizzle[i];
415                     // todo: fix for packed
416 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
417 if (i >= info.numComps)
418 {
419 // set the default component val
420 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
421 continue;
422 }
423
424 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
425 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
426 // if x or y, use vi128XY permute result, else use vi128ZW
427 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
428
429 // extract packed component 128 bit lanes
430 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
431 }
432
433 }
434 else
435 {
436 // pshufb masks for each component
437 Value* vConstMask[2];
438 // x/z shuffle mask
439 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
440 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
441
442 // y/w shuffle mask
443 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
444 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
445
446
447 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
448 // apply defaults
449 for (uint32_t i = 0; i < 4; ++i)
450 {
451 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
452 }
453
454 for (uint32_t i = 0; i < info.numComps; i++)
455 {
456 uint32_t swizzleIndex = info.swizzle[i];
457
458 // select correct constMask for x/z or y/w pshufb
459 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
460                     // if x or y, use the first gather result (xy), else use the second (zw)
461 uint32_t selectedGather = (i < 2) ? 0 : 1;
462
463 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
464 // after pshufb mask for x channel; z uses the same shuffle from the second gather
465 // 256i - 0 1 2 3 4 5 6 7
466 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
467 }
468 }
469 }
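    // Net effect of the unpacked (bPackedOutput == false) path above, per lane:
    // x and y come from the first gather result, z and w from the second, and each
    // 16-bit component ends up zero-extended in its swizzled 32-bit output slot:
    //
    //     // for each fetched component c (c < info.numComps); others keep defaults
    //     uint32_t dword = vGatherInput[(c < 2) ? 0 : 1][lane];
    //     out[info.swizzle[c]][lane] = (c & 1) ? (dword >> 16)      // y or w
    //                                          : (dword & 0xFFFF);  // x or z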
470
471 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
472 {
473 // cast types
474 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
475 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
476
477 if (bPackedOutput)
478 {
479 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
480 // shuffle mask
481 Value* vConstMask = C<char>({ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
482 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 });
483 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
484 // after pshufb: group components together in each 128bit lane
485 // 256i - 0 1 2 3 4 5 6 7
486 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
487
488 Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
489 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
490 // 256i - 0 1 2 3 4 5 6 7
491 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
492
493 // do the same for zw components
494 Value* vi128ZW = nullptr;
495 if (info.numComps > 2)
496 {
497 vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
498 }
499
500                 // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
501 for (uint32_t i = 0; i < 4; i++)
502 {
503 uint32_t swizzleIndex = info.swizzle[i];
504 // todo: fix for packed
505 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
506 if (i >= info.numComps)
507 {
508 // set the default component val
509 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
510 continue;
511 }
512
513 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
514 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
515 // if x or y, use vi128XY permute result, else use vi128ZW
516 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
517
518                     // extract packed component 128 bit lanes
519 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
520 }
521 }
522 // else zero extend
523 else {
524 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
525 // apply defaults
526 for (uint32_t i = 0; i < 4; ++i)
527 {
528 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
529 }
530
531 for (uint32_t i = 0; i < info.numComps; i++) {
532 uint32_t swizzleIndex = info.swizzle[i];
533
534 // pshufb masks for each component
535 Value* vConstMask;
536 switch (i)
537 {
538 case 0:
539 // x shuffle mask
540 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
541 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
542 break;
543 case 1:
544 // y shuffle mask
545 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
546 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
547 break;
548 case 2:
549 // z shuffle mask
550 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
551 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
552 break;
553 case 3:
554 // w shuffle mask
555 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
556 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
557 break;
558 default:
559 vConstMask = nullptr;
560 break;
561 }
562
563 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
564 // after pshufb for x channel
565 // 256i - 0 1 2 3 4 5 6 7
566 // x000 x000 x000 x000 x000 x000 x000 x000
567 }
568 }
569 }
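    // Net effect of the zero-extend (bPackedOutput == false) path above, per lane:
    // component c is byte c of the gathered dword, zero-extended to 32 bits and
    // written to its swizzled output slot:
    //
    //     // for each fetched component c (c < info.numComps); others keep defaults
    //     out[info.swizzle[c]][lane] = (vGatherInput[lane] >> (8 * c)) & 0xFF;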
570
571 //////////////////////////////////////////////////////////////////////////
572 /// @brief emulates a scatter operation.
573 /// @param pDst - pointer to destination
574 /// @param vSrc - vector of src data to scatter
575 /// @param vOffsets - vector of byte offsets from pDst
576 /// @param vMask - mask of valid lanes
577 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
578 {
579 /* Scatter algorithm
580
581 while(Index = BitScanForward(mask))
582 srcElem = srcVector[Index]
583 offsetElem = offsetVector[Index]
584 *(pDst + offsetElem) = srcElem
585             Update mask: mask &= ~(1 << Index)
586
587 */
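        // The same algorithm as a compilable scalar sketch (8-wide assumed;
        // movemask()/tzcnt() stand in for VMOVMSKPS/CTTZ, with tzcnt returning 32
        // for a zero mask, and ElemTy standing in for the scalar element type of vSrc):
        //
        //     uint32_t mask = movemask(vMask);
        //     for (uint32_t index = tzcnt(mask); index != 32; index = tzcnt(mask))
        //     {
        //         *(ElemTy*)(pDst + vOffsets[index]) = vSrc[index];
        //         mask &= ~(1u << index);
        //     }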
588
589 BasicBlock* pCurBB = IRB()->GetInsertBlock();
590 Function* pFunc = pCurBB->getParent();
591 Type* pSrcTy = vSrc->getType()->getVectorElementType();
592
593 // Store vectors on stack
594 if (pScatterStackSrc == nullptr)
595 {
596 // Save off stack allocations and reuse per scatter. Significantly reduces stack
597 // requirements for shaders with a lot of scatters.
598 pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
599 pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
600 }
601
602 Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
603 Value* pOffsetsArrayPtr = pScatterStackOffsets;
604 STORE(vSrc, pSrcArrayPtr);
605 STORE(vOffsets, pOffsetsArrayPtr);
606
607 // Cast to pointers for random access
608 pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
609 pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
610
611 Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
612
613 // Setup loop basic block
614 BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);
615
616 // compute first set bit
617 Value* pIndex = CTTZ(pMask, C(false));
618
619 Value* pIsUndef = ICMP_EQ(pIndex, C(32));
620
621 // Split current block
622 BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
623
624 // Remove unconditional jump created by splitBasicBlock
625 pCurBB->getTerminator()->eraseFromParent();
626
627 // Add terminator to end of original block
628 IRB()->SetInsertPoint(pCurBB);
629
630 // Add conditional branch
631 COND_BR(pIsUndef, pPostLoop, pLoop);
632
633 // Add loop basic block contents
634 IRB()->SetInsertPoint(pLoop);
635 PHINode* pIndexPhi = PHI(mInt32Ty, 2);
636 PHINode* pMaskPhi = PHI(mInt32Ty, 2);
637
638 pIndexPhi->addIncoming(pIndex, pCurBB);
639 pMaskPhi->addIncoming(pMask, pCurBB);
640
641 // Extract elements for this index
642 Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
643 Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
644
645 // GEP to this offset in dst
646 Value* pCurDst = GEP(pDst, pOffsetElem);
647 pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
648 STORE(pSrcElem, pCurDst);
649
650 // Update the mask
651 Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
652
653 // Terminator
654 Value* pNewIndex = CTTZ(pNewMask, C(false));
655
656 pIsUndef = ICMP_EQ(pNewIndex, C(32));
657 COND_BR(pIsUndef, pPostLoop, pLoop);
658
659 // Update phi edges
660 pIndexPhi->addIncoming(pNewIndex, pLoop);
661 pMaskPhi->addIncoming(pNewMask, pLoop);
662
663 // Move builder to beginning of post loop
664 IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
665 }
666 }