/****************************************************************************
 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file builder_misc.cpp
 *
 * @brief Implementation for miscellaneous builder functions
 *
 ******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"
#include "common/rdtsc_buckets.h"

namespace SwrJit
{
    void Builder::AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage)
    {
        SWR_ASSERT(ptr->getType() != mInt64Ty,
                   "Address appears to be GFX access. Requires translation through BuilderGfxMem.");
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<Value*>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return GEPA(ptr, indices);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return GEPA(ptr, indices);
    }
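
    // Usage sketch (illustrative only; pBase and idx are hypothetical):
    //
    //     Value* pElemA = GEP(pBase, {C(0), idx});  // runtime Value* indices
    //     Value* pElemB = GEP(pBase, {0u, 2u});     // compile-time constants
    //
    // Both forms forward to GEPA() and only compute an address, without
    // touching memory; the uint32_t overload simply wraps each index in C().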

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return IN_BOUNDS_GEP(ptr, indices);
    }

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return IN_BOUNDS_GEP(ptr, indices);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const char* Name, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const Twine& Name, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Type* Ty, Value* Ptr, const Twine& Name, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ty, Ptr, Name);
    }

    LoadInst* Builder::LOAD(Value* Ptr, bool isVolatile, const Twine& Name, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, isVolatile, Name);
    }

    LoadInst* Builder::LOAD(Value* basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine& name,
                            JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(basePtr, usage);
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    LoadInst* Builder::LOADV(Value* basePtr,
                             const std::initializer_list<Value*>& indices,
                             const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return LOAD(GEPA(basePtr, valIndices), name);
    }
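
    // Usage sketch (illustrative; pCtx, pArray, and laneIdx are hypothetical):
    //
    //     // load the third member of the struct pCtx points at,
    //     // i.e. LOAD(GEP(pCtx, {0, 2}))
    //     Value* pVal = LOAD(pCtx, {0u, 2u}, "member");
    //
    //     // same idea with runtime Value* indices
    //     Value* pLane = LOADV(pArray, {C(0), laneIdx});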

    StoreInst* Builder::STORE(Value* val, Value* basePtr, const std::initializer_list<uint32_t>& indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return STORE(val, GEPA(basePtr, valIndices));
    }

    StoreInst* Builder::STOREV(Value* val, Value* basePtr, const std::initializer_list<Value*>& indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return STORE(val, GEPA(basePtr, valIndices));
    }

    Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset)
    {
        return GEP(base, offset);
    }

    Value* Builder::MEM_ADD(Value* i32Incr,
                            Value* basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine& name)
    {
        Value* i32Value  = LOAD(GEP(basePtr, indices), name);
        Value* i32Result = ADD(i32Value, i32Incr);
        return STORE(i32Result, GEP(basePtr, indices));
    }
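
    // Usage sketch: a read-modify-write increment of a counter embedded in a
    // stats struct (names are illustrative):
    //
    //     MEM_ADD(C(1), pStats, {0u, 0u}, "drawsExecuted");
    //
    // Note this emits a plain load/add/store sequence, not an atomic RMW, so
    // it is only safe where no other thread updates the same location.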

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads.
    /// @param vSrc - SIMD wide value that is returned for lanes with an inactive mask
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
    }

    Value* Builder::GATHERPS_16(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERPS_16(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads.
    /// @param vSrc - SIMD wide value that is returned for lanes with an inactive mask
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    Value* Builder::GATHERDD_16(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERDD_16(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads.
    /// @param vSrc - SIMD wide value that is returned for lanes with an inactive mask
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        Value* vGather;

        // use avx2 gather instruction if available
        if (JM()->mArch.AVX2())
        {
            vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth / 2)),
                            VectorType::get(mDoubleTy, mVWidth / 2));
            vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather          = UndefValue::get(VectorType::get(mDoubleTy, 4));
            Value* vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
            Value* vOffsets  = MUL(vIndices, vScaleVec);
            for (uint32_t i = 0; i < mVWidth / 2; ++i)
            {
                // single component byte index
                Value* offset = VEXTRACT(vOffsets, C(i));
                // byte pointer to component
                Value* loadAddress = GEP(pBase, offset);
                loadAddress        = BITCAST(loadAddress, PointerType::get(mDoubleTy, 0));
                // pointer to the value to load if we're masking off a component
                Value* maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
                Value* selMask         = VEXTRACT(vMask, C(i));
                // switch in a safe address to load if the lane is masked off
                Value* validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value* val          = LOAD(validAddress);
                vGather             = VINSERT(vGather, val, C(i));
            }

            STACKRESTORE(pStack);
        }

        return vGather;
    }
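
    // For reference, the emulation path above computes the moral equivalent
    // of this scalar loop (illustrative pseudocode, not emitted verbatim):
    //
    //     for (i = 0; i < mVWidth / 2; ++i)
    //         result[i] = vMask[i] ? *(double*)(pBase + vIndices[i] * scale)
    //                              : vSrc[i];
    //
    // except that masked-off lanes are handled by redirecting the load to a
    // stack copy of vSrc rather than by branching per lane.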

    //////////////////////////////////////////////////////////////////////////
    /// @brief Alternative masked gather where source is a vector of pointers
    /// @param pVecSrcPtr - SIMD wide vector of pointers
    /// @param pVecMask - SIMD active lanes
    /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive
    Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
    {
        return MASKED_GATHER(pVecSrcPtr, 4, pVecMask, pVecPassthru);
    }
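
    // Usage sketch (illustrative; the operands are hypothetical). The literal
    // 4 above is the per-element alignment, in bytes, forwarded by the
    // MASKED_GATHER wrapper:
    //
    //     Value* vVals = GATHER_PTR(vLanePtrs, vLaneMask, vPassthru);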

    void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
                          Value* mask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage)
    {
        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
        if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
        {
            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
        else
        {
            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
    }
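
    // Usage sketch (illustrative; the operands are hypothetical):
    //
    //     Value* vComponents[4];
    //     Gather4(R32G32B32A32_FLOAT, pBase, vByteOffsets, vActiveMask,
    //             vComponents, false /*bPackedOutput*/, usage);
    //     // vComponents[] now holds one SIMD-wide vector per channel,
    //     // already routed through the format's swizzle.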

    void Builder::GATHER4PS(const SWR_FORMAT_INFO& info, Value* pSrcBase, Value* byteOffsets,
                            Value* vMask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((float)0);

            // always have at least one component out of x or y to fetch
            vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);

            break;
        }
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] =
                    GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }

            break;
        }
        default:
            SWR_INVALID("Invalid float format");
            break;
        }
    }

    void Builder::GATHER4DD(const SWR_FORMAT_INFO& info, Value* pSrcBase, Value* byteOffsets,
                            Value* vMask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 8:
        {
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
            Value* vGatherResult    = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of an 8x32bit integer gather for 8bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

            Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);

            break;
        }
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);

            // always have at least one component out of x or y to fetch
            vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);

            break;
        }
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] =
                    GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }

            break;
        }
        default:
            SWR_INVALID("unsupported format");
            break;
        }
    }

    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO& info, Value* vGatherInput[2],
                                      Value* vGatherOutput[4], bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        // input could either be float or int vector; do shuffle work in int
        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

        if (bPackedOutput)
        {
            Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask  = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                          0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                Value* vShufResultZW = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
                vi128ZW = BITCAST(VPERMD(vShufResultZW, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            }

            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        else
        {
            // pshufb masks for each component
            Value* vConstMask[2];
            // x/z shuffle mask
            vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                                     0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1});

            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});

            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // select correct constMask for x/z or y/w pshufb
                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, shuffle the first (xy) gather result, else the second (zw) result
                uint32_t selectedGather = (i < 2) ? 0 : 1;

                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
                // after pshufb mask for x channel; z uses the same shuffle from the second gather
                // 256i - 0    1    2    3    4    5    6    7
                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
            }
        }
    }
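
    // Worked example for the unpacked path above (illustrative): with the
    // x/z mask {0, 1, -1, -1, ...}, pshufb copies bytes 0-1 of each 32bit
    // lane (the 16bit x or z value) into the low word and writes zeros for
    // every -1 entry, so each component ends up zero-extended to 32 bits
    // in place.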

    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO& info, Value* vGatherInput,
                                     Value* vGatherOutput[], bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        if (bPackedOutput)
        {
            Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask  = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                          0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

            Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
            }

            // extract each enabled component; masked-off components get the format default
            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        else
        {
            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // pshufb masks for each component
                Value* vConstMask = nullptr;
                switch (i)
                {
                case 0:
                    // x shuffle mask
                    vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                          0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                    break;
                case 1:
                    // y shuffle mask
                    vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                          1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                    break;
                case 2:
                    // z shuffle mask
                    vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                          2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                    break;
                case 3:
                    // w shuffle mask
                    vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                          3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                    break;
                default:
                    break;
                }

                vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                // after pshufb for x channel
                // 256i - 0    1    2    3    4    5    6    7
                //        x000 x000 x000 x000 x000 x000 x000 x000
            }
        }
    }
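
    // Same idea as the 16bpc shuffle, at 8bpc (illustrative): the mask for
    // component i selects bytes i, i+4, i+8, i+12 of each 128bit half into
    // the low byte of the four 32bit lanes, with the -1 entries zero-filling
    // bits [31:8].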

    //////////////////////////////////////////////////////////////////////////
    /// @brief emulates a scatter operation.
    /// @param pDst - pointer to destination
    /// @param vSrc - vector of src data to scatter
    /// @param vOffsets - vector of byte offsets from pDst
    /// @param vMask - mask of valid lanes
    void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
    {
        /* Scatter algorithm

           while (Index = BitScanForward(mask))
               srcElem = srcVector[Index]
               offsetElem = offsetVector[Index]
               *(pDst + offsetElem) = srcElem
               Update mask (&= ~(1 << Index))
        */
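
        // The IR emitted below forms this CFG (orientation sketch; the block
        // names match the code that follows):
        //
        //     pCurBB:    pMask  = movemask(vMask)
        //                pIndex = cttz(pMask)
        //                branch (pIndex == 32) ? pPostLoop : pLoop
        //     pLoop:     phi(pIndex, pMask); store one element;
        //                clear the lane's bit; recompute cttz
        //                branch (done) ? pPostLoop : pLoop
        //     pPostLoop: builder resumes here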

        BasicBlock* pCurBB = IRB()->GetInsertBlock();
        Function* pFunc    = pCurBB->getParent();
        Type* pSrcTy       = vSrc->getType()->getVectorElementType();

        // Store vectors on stack
        if (pScatterStackSrc == nullptr)
        {
            // Save off stack allocations and reuse per scatter. Significantly reduces stack
            // requirements for shaders with a lot of scatters.
            pScatterStackSrc     = CreateEntryAlloca(pFunc, mSimdInt64Ty);
            pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
        }

        Value* pSrcArrayPtr     = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
        Value* pOffsetsArrayPtr = pScatterStackOffsets;
        STORE(vSrc, pSrcArrayPtr);
        STORE(vOffsets, pOffsetsArrayPtr);

        // Cast to pointers for random access
        pSrcArrayPtr     = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
        pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

        Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));

        // Setup loop basic block
        BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);

        // compute first set bit
        Value* pIndex = CTTZ(pMask, C(false));

        Value* pIsUndef = ICMP_EQ(pIndex, C(32));

        // Split current block
        BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

        // Remove unconditional jump created by splitBasicBlock
        pCurBB->getTerminator()->eraseFromParent();

        // Add terminator to end of original block
        IRB()->SetInsertPoint(pCurBB);

        // Add conditional branch
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Add loop basic block contents
        IRB()->SetInsertPoint(pLoop);
        PHINode* pIndexPhi = PHI(mInt32Ty, 2);
        PHINode* pMaskPhi  = PHI(mInt32Ty, 2);

        pIndexPhi->addIncoming(pIndex, pCurBB);
        pMaskPhi->addIncoming(pMask, pCurBB);

        // Extract elements for this index
        Value* pSrcElem    = LOADV(pSrcArrayPtr, {pIndexPhi});
        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, {pIndexPhi});

        // GEP to this offset in dst
        Value* pCurDst = GEP(pDst, pOffsetElem);
        pCurDst        = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
        STORE(pSrcElem, pCurDst);

        // Update the mask
        Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

        // Terminator
        Value* pNewIndex = CTTZ(pNewMask, C(false));

        pIsUndef = ICMP_EQ(pNewIndex, C(32));
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Update the phi edges now that the loop body is complete
        pIndexPhi->addIncoming(pNewIndex, pLoop);
        pMaskPhi->addIncoming(pNewMask, pLoop);

        // Move builder to beginning of post loop
        IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
    }
} // namespace SwrJit