swr/rast: Enable generalized fetch jit
mesa.git: src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_mem.cpp
24 *
25 * @brief Implementation for the memory builder functions (loads, stores, gathers, scatters)
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder.h"
32 #include "common/rdtsc_buckets.h"
33
34 #include <cstdarg>
35
36
37 namespace SwrJit
38 {
39 void Builder::AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage)
40 {
41 SWR_ASSERT(ptr->getType() != mInt64Ty, "Address appears to be GFX access. Requires translation through BuilderGfxMem.");
42 }
43
44 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
45 {
46 std::vector<Value*> indices;
47 for (auto i : indexList)
48 indices.push_back(i);
49 return GEPA(ptr, indices);
50 }
51
52 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
53 {
54 std::vector<Value*> indices;
55 for (auto i : indexList)
56 indices.push_back(C(i));
57 return GEPA(ptr, indices);
58 }
59
60 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
61 {
62 std::vector<Value*> indices;
63 for (auto i : indexList)
64 indices.push_back(i);
65 return IN_BOUNDS_GEP(ptr, indices);
66 }
67
68 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
69 {
70 std::vector<Value*> indices;
71 for (auto i : indexList)
72 indices.push_back(C(i));
73 return IN_BOUNDS_GEP(ptr, indices);
74 }
75
76 LoadInst* Builder::LOAD(Value *Ptr, const char *Name, JIT_MEM_CLIENT usage)
77 {
78 AssertMemoryUsageParams(Ptr, usage);
79 return IRB()->CreateLoad(Ptr, Name);
80 }
81
82 LoadInst* Builder::LOAD(Value *Ptr, const Twine &Name, JIT_MEM_CLIENT usage)
83 {
84 AssertMemoryUsageParams(Ptr, usage);
85 return IRB()->CreateLoad(Ptr, Name);
86 }
87
88 LoadInst* Builder::LOAD(Type *Ty, Value *Ptr, const Twine &Name, JIT_MEM_CLIENT usage)
89 {
90 AssertMemoryUsageParams(Ptr, usage);
91 return IRB()->CreateLoad(Ty, Ptr, Name);
92 }
93
94 LoadInst* Builder::LOAD(Value *Ptr, bool isVolatile, const Twine &Name, JIT_MEM_CLIENT usage)
95 {
96 AssertMemoryUsageParams(Ptr, usage);
97 return IRB()->CreateLoad(Ptr, isVolatile, Name);
98 }
99
100 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name, JIT_MEM_CLIENT usage)
101 {
102 AssertMemoryUsageParams(basePtr, usage);
103 std::vector<Value*> valIndices;
104 for (auto i : indices)
105 valIndices.push_back(C(i));
106 return LOAD(GEPA(basePtr, valIndices), name);
107 }
108
109 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
110 {
111 std::vector<Value*> valIndices;
112 for (auto i : indices)
113 valIndices.push_back(i);
114 return LOAD(GEPA(basePtr, valIndices), name);
115 }
116
117 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
118 {
119 std::vector<Value*> valIndices;
120 for (auto i : indices)
121 valIndices.push_back(C(i));
122 return STORE(val, GEPA(basePtr, valIndices));
123 }
124
125 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
126 {
127 std::vector<Value*> valIndices;
128 for (auto i : indices)
129 valIndices.push_back(i);
130 return STORE(val, GEPA(basePtr, valIndices));
131 }
132
133 Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant *offset)
134 {
135 return GEP(base, offset);
136 }
137
138 Value* Builder::MEM_ADD(Value* i32Incr, Value* basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
139 {
140 Value* i32Value = LOAD(GEP(basePtr, indices), name);
141 Value* i32Result = ADD(i32Value, i32Incr);
142 return STORE(i32Result, GEP(basePtr, indices));
143 }
144
145 //////////////////////////////////////////////////////////////////////////
146 /// @brief Generate a masked gather operation in LLVM IR. If not
147 /// supported on the underlying platform, emulate it with loads
148 /// @param vSrc - SIMD wide value used for lanes whose mask bit is not set
149 /// @param pBase - Int8* base VB address pointer value
150 /// @param vIndices - SIMD wide value of VB byte offsets
151 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
152 /// @param scale - value to scale indices by
153 Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale, JIT_MEM_CLIENT usage)
154 {
155 AssertMemoryUsageParams(pBase, usage);
156
157 return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
158 }
159
160 Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale, JIT_MEM_CLIENT usage)
161 {
162 AssertMemoryUsageParams(pBase, usage);
163
164 return VGATHERPS_16(vSrc, pBase, vIndices, vMask, C(scale));
165 }
166
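//////////////////////////////////////////////////////////////////////////
// Illustrative sketch (comment only, not emitted IR): the per-lane result
// the masked gathers above are expected to produce at runtime. The same
// pattern applies to GATHERDD / GATHERDD_16 with 32-bit integer elements.
// SIMD_WIDTH and maskSet() are placeholder names for this sketch.
//
//     for (uint32_t lane = 0; lane < SIMD_WIDTH; ++lane)
//     {
//         dst[lane] = maskSet(vMask, lane)
//             ? *(const float*)(pBase + (int64_t)vIndices[lane] * scale)
//             : vSrc[lane];
//     }
//
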
167 //////////////////////////////////////////////////////////////////////////
168 /// @brief Generate a masked gather operation in LLVM IR. If not
169 /// supported on the underlying platform, emulate it with loads
170 /// @param vSrc - SIMD wide value used for lanes whose mask bit is not set
171 /// @param pBase - Int8* base VB address pointer value
172 /// @param vIndices - SIMD wide value of VB byte offsets
173 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
174 /// @param scale - value to scale indices by
175 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale, JIT_MEM_CLIENT usage)
176 {
177 AssertMemoryUsageParams(pBase, usage);
178
179 return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
180 }
181
182 Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale, JIT_MEM_CLIENT usage)
183 {
184 AssertMemoryUsageParams(pBase, usage);
185
186 return VGATHERDD_16(vSrc, pBase, vIndices, vMask, C(scale));
187 }
188
189 //////////////////////////////////////////////////////////////////////////
190 /// @brief Generate a masked gather operation in LLVM IR. If not
191 /// supported on the underlying platform, emulate it with loads
192 /// @param vSrc - SIMD wide value used for lanes whose mask bit is not set
193 /// @param pBase - Int8* base VB address pointer value
194 /// @param vIndices - SIMD wide value of VB byte offsets
195 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
196 /// @param scale - value to scale indices by
197 Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
198 {
199 Value* vGather;
200
201 // use avx2 gather instruction if available
202 if (JM()->mArch.AVX2())
203 {
204 vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth / 2)), VectorType::get(mDoubleTy, mVWidth / 2));
205 vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
206 }
207 else
208 {
209 Value* pStack = STACKSAVE();
210
211 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
212 Value* vSrcPtr = ALLOCA(vSrc->getType());
213 STORE(vSrc, vSrcPtr);
214
215 vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
216 Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
217 Value *vOffsets = MUL(vIndices, vScaleVec);
218 for (uint32_t i = 0; i < mVWidth / 2; ++i)
219 {
220 // single component byte index
221 Value *offset = VEXTRACT(vOffsets, C(i));
222 // byte pointer to component
223 Value *loadAddress = GEP(pBase, offset);
224 loadAddress = BITCAST(loadAddress, PointerType::get(mDoubleTy, 0));
225 // pointer to the value to load if we're masking off a component
226 Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
227 Value *selMask = VEXTRACT(vMask, C(i));
228 // switch in a safe (stack) address to load from when the lane is masked off
229 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
230 Value *val = LOAD(validAddress);
231 vGather = VINSERT(vGather, val, C(i));
232 }
233 STACKRESTORE(pStack);
234 }
235 return vGather;
236 }
237
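//////////////////////////////////////////////////////////////////////////
// Note on the non-AVX2 fallback above (sketch, not emitted IR): inactive
// lanes are redirected to the stack copy of vSrc so every lane still loads
// from a known-valid address. Roughly, with placeholder names srcCopy and
// maskSet():
//
//     for (uint32_t lane = 0; lane < SIMD_WIDTH / 2; ++lane)
//     {
//         const double* pLoad = maskSet(vMask, lane)
//             ? (const double*)(pBase + vIndices[lane] * scale)
//             : &srcCopy[lane];   // safe fallback address on the stack
//         dst[lane] = *pLoad;
//     }
//
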
238 //////////////////////////////////////////////////////////////////////////
239 /// @brief Alternative masked gather where source is a vector of pointers
240 /// @param pVecSrcPtr - SIMD wide vector of pointers
241 /// @param pVecMask - SIMD active lanes
242 /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive
243 Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
244 {
245 return MASKED_GATHER(pVecSrcPtr, 4, pVecMask, pVecPassthru);
246 }
247
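//////////////////////////////////////////////////////////////////////////
// Sketch of the per-lane behavior of GATHER_PTR above (not emitted IR);
// the literal 4 passed to MASKED_GATHER is the element alignment in bytes.
//
//     for (uint32_t lane = 0; lane < SIMD_WIDTH; ++lane)
//     {
//         dst[lane] = maskSet(pVecMask, lane) ? *pVecSrcPtr[lane]
//                                             : pVecPassthru[lane];
//     }
//
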
248 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
249 Value* mask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage)
250 {
251 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
252 if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
253 {
254 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
255 }
256 else
257 {
258 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
259 }
260 }
261
262 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
263 Value* vMask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage)
264 {
265 switch (info.bpp / info.numComps)
266 {
267 case 16:
268 {
269 Value* vGatherResult[2];
270
271 // TODO: vGatherMaskedVal
272 Value* vGatherMaskedVal = VIMMED1((float)0);
273
274 // always have at least one component out of x or y to fetch
275
276 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
277 // e.g. result of first 8x32bit integer gather for 16bit components
278 // 256i - 0 1 2 3 4 5 6 7
279 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
280 //
281
282 // if we have at least one component out of z or w to fetch
283 if (info.numComps > 2)
284 {
285 // offset base to the next components(zw) in the vertex to gather
286 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
287
288 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
289 // e.g. result of second 8x32bit integer gather for 16bit components
290 // 256i - 0 1 2 3 4 5 6 7
291 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
292 //
293 }
294 else
295 {
296 vGatherResult[1] = vGatherMaskedVal;
297 }
298
299 // Shuffle gathered components into place, each row is a component
300 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
301 }
302 break;
303 case 32:
304 {
305 // apply defaults
306 for (uint32_t i = 0; i < 4; ++i)
307 {
308 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
309 }
310
311 for (uint32_t i = 0; i < info.numComps; i++)
312 {
313 uint32_t swizzleIndex = info.swizzle[i];
314
315 // Gather a SIMD of components
316 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);
317
318 // offset base to the next component to gather
319 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
320 }
321 }
322 break;
323 default:
324 SWR_INVALID("Invalid float format");
325 break;
326 }
327 }
328
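//////////////////////////////////////////////////////////////////////////
// Worked example for the 16bpc path above (sketch, not emitted IR), using
// a hypothetical 4 x 16-bit element:
//
//     struct Element16 { uint16_t x, y, z, w; };   // hypothetical layout
//     // gather 0: each lane's 32-bit load packs x,y
//     uint32_t xy = *(const uint32_t*)(pSrcBase     + byteOffsets[lane]);
//     // gather 1: base advanced by 4 bytes, the 32-bit load packs z,w
//     uint32_t zw = *(const uint32_t*)(pSrcBase + 4 + byteOffsets[lane]);
//
// In the 32bpc path each component is gathered separately and the base
// likewise advances 4 bytes per component.
//
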
329 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
330 Value* vMask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage)
331 {
332 switch (info.bpp / info.numComps)
333 {
334 case 8:
335 {
336 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
337 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
338 // e.g. result of an 8x32bit integer gather for 8bit components
339 // 256i - 0 1 2 3 4 5 6 7
340 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
341
342 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
343 }
344 break;
345 case 16:
346 {
347 Value* vGatherResult[2];
348
349 // TODO: vGatherMaskedVal
350 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
351
352 // always have at least one component out of x or y to fetch
353
354 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
355 // e.g. result of first 8x32bit integer gather for 16bit components
356 // 256i - 0 1 2 3 4 5 6 7
357 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
358 //
359
360 // if we have at least one component out of z or w to fetch
361 if (info.numComps > 2)
362 {
363 // offset base to the next components(zw) in the vertex to gather
364 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
365
366 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
367 // e.g. result of second 8x32bit integer gather for 16bit components
368 // 256i - 0 1 2 3 4 5 6 7
369 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
370 //
371 }
372 else
373 {
374 vGatherResult[1] = vGatherMaskedVal;
375 }
376
377 // Shuffle gathered components into place, each row is a component
378 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
379
380 }
381 break;
382 case 32:
383 {
384 // apply defaults
385 for (uint32_t i = 0; i < 4; ++i)
386 {
387 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
388 }
389
390 for (uint32_t i = 0; i < info.numComps; i++)
391 {
392 uint32_t swizzleIndex = info.swizzle[i];
393
394 // Gather a SIMD of components
395 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);
396
397 // offset base to the next component to gather
398 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
399 }
400 }
401 break;
402 default:
403 SWR_INVALID("unsupported format");
404 break;
405 }
406 }
407
408 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
409 {
410 // cast types
411 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
412 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
413
414 // input could either be float or int vector; do shuffle work in int
415 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
416 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
417
418 if (bPackedOutput)
419 {
420 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
421
422 // shuffle mask
423 Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
424 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
425 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
426 // after pshufb: group components together in each 128bit lane
427 // 256i - 0 1 2 3 4 5 6 7
428 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
429
430 Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
431 // after PERMD: move and pack xy components into each 128bit lane
432 // 256i - 0 1 2 3 4 5 6 7
433 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
434
435 // do the same for zw components
436 Value* vi128ZW = nullptr;
437 if (info.numComps > 2)
438 {
439 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
440 vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
441 }
442
443 for (uint32_t i = 0; i < 4; i++)
444 {
445 uint32_t swizzleIndex = info.swizzle[i];
446 // todo: fix for packed
447 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
448 if (i >= info.numComps)
449 {
450 // set the default component val
451 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
452 continue;
453 }
454
455 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
456 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
457 // if x or y, use vi128XY permute result, else use vi128ZW
458 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
459
460 // extract packed component 128 bit lanes
461 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
462 }
463
464 }
465 else
466 {
467 // pshufb masks for each component
468 Value* vConstMask[2];
469 // x/z shuffle mask
470 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
471 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
472
473 // y/w shuffle mask
474 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
475 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
476
477
478 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
479 // apply defaults
480 for (uint32_t i = 0; i < 4; ++i)
481 {
482 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
483 }
484
485 for (uint32_t i = 0; i < info.numComps; i++)
486 {
487 uint32_t swizzleIndex = info.swizzle[i];
488
489 // select correct constMask for x/z or y/w pshufb
490 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
491 // if x or y, use the first (xy) gather result, else use the second (zw) gather result
492 uint32_t selectedGather = (i < 2) ? 0 : 1;
493
494 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
495 // after pshufb mask for x channel; z uses the same shuffle from the second gather
496 // 256i - 0 1 2 3 4 5 6 7
497 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
498 }
499 }
500 }
501
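//////////////////////////////////////////////////////////////////////////
// Reference for the PSHUFB-based shuffles above and below (sketch, not
// emitted IR): within each 128-bit lane, the bytes of the result are
//
//     for (uint32_t b = 0; b < 16; ++b)
//         out[b] = (mask[b] & 0x80) ? 0 : in[mask[b] & 0x0F];
//
// so the packed-path mask { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15 }
// regroups interleaved 16-bit x/y pairs into all x halves followed by all
// y halves, while the -1 entries in the unpacked-path masks zero-extend
// each selected component to 32 bits.
//
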
502 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
503 {
504 // cast types
505 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
506 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
507
508 if (bPackedOutput)
509 {
510 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
511 // shuffle mask
512 Value* vConstMask = C<char>({ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
513 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 });
514 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
515 // after pshufb: group components together in each 128bit lane
516 // 256i - 0 1 2 3 4 5 6 7
517 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
518
519 Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
520 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
521 // 256i - 0 1 2 3 4 5 6 7
522 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
523
524 // do the same for zw components
525 Value* vi128ZW = nullptr;
526 if (info.numComps > 2)
527 {
528 vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
529 }
530
531 // extract the enabled components; lanes beyond info.numComps receive the format defaults
532 for (uint32_t i = 0; i < 4; i++)
533 {
534 uint32_t swizzleIndex = info.swizzle[i];
535 // todo: fix for packed
536 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
537 if (i >= info.numComps)
538 {
539 // set the default component val
540 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
541 continue;
542 }
543
544 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
545 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
546 // if x or y, use vi128XY permute result, else use vi128ZW
547 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
548
549 // extract packed component 128 bit lanes
550 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
551 }
552 }
553 // else zero extend
554 else {
555 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
556 // apply defaults
557 for (uint32_t i = 0; i < 4; ++i)
558 {
559 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
560 }
561
562 for (uint32_t i = 0; i < info.numComps; i++) {
563 uint32_t swizzleIndex = info.swizzle[i];
564
565 // pshufb masks for each component
566 Value* vConstMask;
567 switch (i)
568 {
569 case 0:
570 // x shuffle mask
571 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
572 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
573 break;
574 case 1:
575 // y shuffle mask
576 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
577 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
578 break;
579 case 2:
580 // z shuffle mask
581 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
582 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
583 break;
584 case 3:
585 // w shuffle mask
586 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
587 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
588 break;
589 default:
590 vConstMask = nullptr;
591 break;
592 }
593
594 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
595 // after pshufb for x channel
596 // 256i - 0 1 2 3 4 5 6 7
597 // x000 x000 x000 x000 x000 x000 x000 x000
598 }
599 }
600 }
601
602 //////////////////////////////////////////////////////////////////////////
603 /// @brief emulates a scatter operation.
604 /// @param pDst - pointer to destination
605 /// @param vSrc - vector of src data to scatter
606 /// @param vOffsets - vector of byte offsets from pDst
607 /// @param vMask - mask of valid lanes
608 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
609 {
610 /* Scatter algorithm
611
612 while(Index = BitScanForward(mask))
613 srcElem = srcVector[Index]
614 offsetElem = offsetVector[Index]
615 *(pDst + offsetElem) = srcElem
616 Update mask (mask &= ~(1 << Index))
617
618 */
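
// Rough C equivalent of the loop emitted below (sketch only;
// movemask() / ctz() / ElemTy are placeholder names):
//
//     uint32_t m = movemask(vMask);
//     while (m != 0)
//     {
//         uint32_t lane = ctz(m);   // index of first active lane
//         *(ElemTy*)((uint8_t*)pDst + vOffsets[lane]) = vSrc[lane];
//         m &= ~(1u << lane);       // clear that lane and repeat
//     }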
619
620 BasicBlock* pCurBB = IRB()->GetInsertBlock();
621 Function* pFunc = pCurBB->getParent();
622 Type* pSrcTy = vSrc->getType()->getVectorElementType();
623
624 // Store vectors on stack
625 if (pScatterStackSrc == nullptr)
626 {
627 // Save off stack allocations and reuse per scatter. Significantly reduces stack
628 // requirements for shaders with a lot of scatters.
629 pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
630 pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
631 }
632
633 Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
634 Value* pOffsetsArrayPtr = pScatterStackOffsets;
635 STORE(vSrc, pSrcArrayPtr);
636 STORE(vOffsets, pOffsetsArrayPtr);
637
638 // Cast to pointers for random access
639 pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
640 pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
641
642 Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
643
644 // Setup loop basic block
645 BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);
646
647 // compute first set bit
648 Value* pIndex = CTTZ(pMask, C(false));
649
650 Value* pIsUndef = ICMP_EQ(pIndex, C(32));
651
652 // Split current block
653 BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
654
655 // Remove unconditional jump created by splitBasicBlock
656 pCurBB->getTerminator()->eraseFromParent();
657
658 // Add terminator to end of original block
659 IRB()->SetInsertPoint(pCurBB);
660
661 // Add conditional branch
662 COND_BR(pIsUndef, pPostLoop, pLoop);
663
664 // Add loop basic block contents
665 IRB()->SetInsertPoint(pLoop);
666 PHINode* pIndexPhi = PHI(mInt32Ty, 2);
667 PHINode* pMaskPhi = PHI(mInt32Ty, 2);
668
669 pIndexPhi->addIncoming(pIndex, pCurBB);
670 pMaskPhi->addIncoming(pMask, pCurBB);
671
672 // Extract elements for this index
673 Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
674 Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
675
676 // GEP to this offset in dst
677 Value* pCurDst = GEP(pDst, pOffsetElem);
678 pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
679 STORE(pSrcElem, pCurDst);
680
681 // Update the mask
682 Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
683
684 // Terminator
685 Value* pNewIndex = CTTZ(pNewMask, C(false));
686
687 pIsUndef = ICMP_EQ(pNewIndex, C(32));
688 COND_BR(pIsUndef, pPostLoop, pLoop);
689
690 // Update phi edges
691 pIndexPhi->addIncoming(pNewIndex, pLoop);
692 pMaskPhi->addIncoming(pNewMask, pLoop);
693
694 // Move builder to beginning of post loop
695 IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
696 }
697
698 }