1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * @file builder_misc.cpp
25 * @brief Implementation for miscellaneous builder functions
29 ******************************************************************************/
30 #include "jit_pch.hpp"
32 #include "common/rdtsc_buckets.h"
38 void Builder::AssertMemoryUsageParams(Value
* ptr
, JIT_MEM_CLIENT usage
)
41 ptr
->getType() != mInt64Ty
,
42 "Address appears to be GFX access. Requires translation through BuilderGfxMem.");
45 Value
* Builder::GEP(Value
* Ptr
, Value
* Idx
, Type
* Ty
, const Twine
& Name
)
47 return IRB()->CreateGEP(Ptr
, Idx
, Name
);
50 Value
* Builder::GEP(Type
* Ty
, Value
* Ptr
, Value
* Idx
, const Twine
& Name
)
52 return IRB()->CreateGEP(Ty
, Ptr
, Idx
, Name
);
55 Value
* Builder::GEP(Value
* ptr
, const std::initializer_list
<Value
*>& indexList
, Type
* Ty
)
57 std::vector
<Value
*> indices
;
58 for (auto i
: indexList
)
60 return GEPA(ptr
, indices
);
63 Value
* Builder::GEP(Value
* ptr
, const std::initializer_list
<uint32_t>& indexList
, Type
* Ty
)
65 std::vector
<Value
*> indices
;
66 for (auto i
: indexList
)
67 indices
.push_back(C(i
));
68 return GEPA(ptr
, indices
);
71 Value
* Builder::GEPA(Value
* Ptr
, ArrayRef
<Value
*> IdxList
, const Twine
& Name
)
73 return IRB()->CreateGEP(Ptr
, IdxList
, Name
);
76 Value
* Builder::GEPA(Type
* Ty
, Value
* Ptr
, ArrayRef
<Value
*> IdxList
, const Twine
& Name
)
78 return IRB()->CreateGEP(Ty
, Ptr
, IdxList
, Name
);
81 Value
* Builder::IN_BOUNDS_GEP(Value
* ptr
, const std::initializer_list
<Value
*>& indexList
)
83 std::vector
<Value
*> indices
;
84 for (auto i
: indexList
)
86 return IN_BOUNDS_GEP(ptr
, indices
);
89 Value
* Builder::IN_BOUNDS_GEP(Value
* ptr
, const std::initializer_list
<uint32_t>& indexList
)
91 std::vector
<Value
*> indices
;
92 for (auto i
: indexList
)
93 indices
.push_back(C(i
));
94 return IN_BOUNDS_GEP(ptr
, indices
);
97 LoadInst
* Builder::LOAD(Value
* Ptr
, const char* Name
, Type
* Ty
, JIT_MEM_CLIENT usage
)
99 AssertMemoryUsageParams(Ptr
, usage
);
100 return IRB()->CreateLoad(Ptr
, Name
);
103 LoadInst
* Builder::LOAD(Value
* Ptr
, const Twine
& Name
, Type
* Ty
, JIT_MEM_CLIENT usage
)
105 AssertMemoryUsageParams(Ptr
, usage
);
106 return IRB()->CreateLoad(Ptr
, Name
);
109 LoadInst
* Builder::LOAD(Type
* Ty
, Value
* Ptr
, const Twine
& Name
, JIT_MEM_CLIENT usage
)
111 AssertMemoryUsageParams(Ptr
, usage
);
112 return IRB()->CreateLoad(Ty
, Ptr
, Name
);
116 Builder::LOAD(Value
* Ptr
, bool isVolatile
, const Twine
& Name
, Type
* Ty
, JIT_MEM_CLIENT usage
)
118 AssertMemoryUsageParams(Ptr
, usage
);
119 return IRB()->CreateLoad(Ptr
, isVolatile
, Name
);
122 LoadInst
* Builder::LOAD(Value
* basePtr
,
123 const std::initializer_list
<uint32_t>& indices
,
124 const llvm::Twine
& name
,
126 JIT_MEM_CLIENT usage
)
128 std::vector
<Value
*> valIndices
;
129 for (auto i
: indices
)
130 valIndices
.push_back(C(i
));
131 return Builder::LOAD(GEPA(basePtr
, valIndices
), name
);
134 LoadInst
* Builder::LOADV(Value
* basePtr
,
135 const std::initializer_list
<Value
*>& indices
,
136 const llvm::Twine
& name
)
138 std::vector
<Value
*> valIndices
;
139 for (auto i
: indices
)
140 valIndices
.push_back(i
);
141 return LOAD(GEPA(basePtr
, valIndices
), name
);
145 Builder::STORE(Value
* val
, Value
* basePtr
, const std::initializer_list
<uint32_t>& indices
)
147 std::vector
<Value
*> valIndices
;
148 for (auto i
: indices
)
149 valIndices
.push_back(C(i
));
150 return STORE(val
, GEPA(basePtr
, valIndices
));
154 Builder::STOREV(Value
* val
, Value
* basePtr
, const std::initializer_list
<Value
*>& indices
)
156 std::vector
<Value
*> valIndices
;
157 for (auto i
: indices
)
158 valIndices
.push_back(i
);
159 return STORE(val
, GEPA(basePtr
, valIndices
));
162 Value
* Builder::OFFSET_TO_NEXT_COMPONENT(Value
* base
, Constant
* offset
)
164 return GEP(base
, offset
);
167 Value
* Builder::MEM_ADD(Value
* i32Incr
,
169 const std::initializer_list
<uint32_t>& indices
,
170 const llvm::Twine
& name
)
172 Value
* i32Value
= LOAD(GEP(basePtr
, indices
), name
);
173 Value
* i32Result
= ADD(i32Value
, i32Incr
);
174 return STORE(i32Result
, GEP(basePtr
, indices
));
177 //////////////////////////////////////////////////////////////////////////
178 /// @brief Generate a masked gather operation in LLVM IR. If not
179 /// supported on the underlying platform, emulate it with loads
180 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
181 /// @param pBase - Int8* base VB address pointer value
182 /// @param vIndices - SIMD wide value of VB byte offsets
183 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
184 /// @param scale - value to scale indices by
185 Value
* Builder::GATHERPS(Value
* vSrc
,
190 JIT_MEM_CLIENT usage
)
192 AssertMemoryUsageParams(pBase
, usage
);
194 return VGATHERPS(vSrc
, pBase
, vIndices
, vMask
, C(scale
));
197 //////////////////////////////////////////////////////////////////////////
198 /// @brief Generate a masked gather operation in LLVM IR. If not
199 /// supported on the underlying platform, emulate it with loads
200 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
201 /// @param pBase - Int8* base VB address pointer value
202 /// @param vIndices - SIMD wide value of VB byte offsets
203 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
204 /// @param scale - value to scale indices by
205 Value
* Builder::GATHERDD(Value
* vSrc
,
210 JIT_MEM_CLIENT usage
)
212 AssertMemoryUsageParams(pBase
, usage
);
214 return VGATHERDD(vSrc
, pBase
, vIndices
, vMask
, C(scale
));
217 //////////////////////////////////////////////////////////////////////////
218 /// @brief Generate a masked gather operation in LLVM IR. If not
219 /// supported on the underlying platform, emulate it with loads
220 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
221 /// @param pBase - Int8* base VB address pointer value
222 /// @param vIndices - SIMD wide value of VB byte offsets
223 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
224 /// @param scale - value to scale indices by
226 Builder::GATHERPD(Value
* vSrc
, Value
* pBase
, Value
* vIndices
, Value
* vMask
, uint8_t scale
)
228 return VGATHERPD(vSrc
, pBase
, vIndices
, vMask
, C(scale
));
231 //////////////////////////////////////////////////////////////////////////
232 /// @brief Alternative masked gather where source is a vector of pointers
233 /// @param pVecSrcPtr - SIMD wide vector of pointers
234 /// @param pVecMask - SIMD active lanes
235 /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive
236 Value
* Builder::GATHER_PTR(Value
* pVecSrcPtr
, Value
* pVecMask
, Value
* pVecPassthru
)
238 return MASKED_GATHER(pVecSrcPtr
, 4, pVecMask
, pVecPassthru
);
241 void Builder::Gather4(const SWR_FORMAT format
,
245 Value
* vGatherComponents
[],
247 JIT_MEM_CLIENT usage
)
249 const SWR_FORMAT_INFO
& info
= GetFormatInfo(format
);
250 if (info
.type
[0] == SWR_TYPE_FLOAT
&& info
.bpc
[0] == 32)
252 GATHER4PS(info
, pSrcBase
, byteOffsets
, mask
, vGatherComponents
, bPackedOutput
, usage
);
256 GATHER4DD(info
, pSrcBase
, byteOffsets
, mask
, vGatherComponents
, bPackedOutput
, usage
);
260 void Builder::GATHER4PS(const SWR_FORMAT_INFO
& info
,
264 Value
* vGatherComponents
[],
266 JIT_MEM_CLIENT usage
)
268 switch (info
.bpp
/ info
.numComps
)
272 Value
* vGatherResult
[2];
274 // TODO: vGatherMaskedVal
275 Value
* vGatherMaskedVal
= VIMMED1((float)0);
277 // always have at least one component out of x or y to fetch
279 vGatherResult
[0] = GATHERPS(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
, 1, usage
);
280 // e.g. result of first 8x32bit integer gather for 16bit components
281 // 256i - 0 1 2 3 4 5 6 7
282 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
285 // if we have at least one component out of x or y to fetch
286 if (info
.numComps
> 2)
288 // offset base to the next components(zw) in the vertex to gather
289 pSrcBase
= OFFSET_TO_NEXT_COMPONENT(pSrcBase
, C((intptr_t)4));
292 GATHERPS(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
, 1, usage
);
293 // e.g. result of second 8x32bit integer gather for 16bit components
294 // 256i - 0 1 2 3 4 5 6 7
295 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
300 vGatherResult
[1] = vGatherMaskedVal
;
303 // Shuffle gathered components into place, each row is a component
304 Shuffle16bpcGather4(info
, vGatherResult
, vGatherComponents
, bPackedOutput
);
310 for (uint32_t i
= 0; i
< 4; ++i
)
312 vGatherComponents
[i
] = VIMMED1(*(float*)&info
.defaults
[i
]);
315 for (uint32_t i
= 0; i
< info
.numComps
; i
++)
317 uint32_t swizzleIndex
= info
.swizzle
[i
];
319 // Gather a SIMD of components
320 vGatherComponents
[swizzleIndex
] = GATHERPS(
321 vGatherComponents
[swizzleIndex
], pSrcBase
, byteOffsets
, vMask
, 1, usage
);
323 // offset base to the next component to gather
324 pSrcBase
= OFFSET_TO_NEXT_COMPONENT(pSrcBase
, C((intptr_t)4));
329 SWR_INVALID("Invalid float format");
334 void Builder::GATHER4DD(const SWR_FORMAT_INFO
& info
,
338 Value
* vGatherComponents
[],
340 JIT_MEM_CLIENT usage
)
342 switch (info
.bpp
/ info
.numComps
)
346 Value
* vGatherMaskedVal
= VIMMED1((int32_t)0);
347 Value
* vGatherResult
=
348 GATHERDD(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
, 1, usage
);
349 // e.g. result of an 8x32bit integer gather for 8bit components
350 // 256i - 0 1 2 3 4 5 6 7
351 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
353 Shuffle8bpcGather4(info
, vGatherResult
, vGatherComponents
, bPackedOutput
);
358 Value
* vGatherResult
[2];
360 // TODO: vGatherMaskedVal
361 Value
* vGatherMaskedVal
= VIMMED1((int32_t)0);
363 // always have at least one component out of x or y to fetch
365 vGatherResult
[0] = GATHERDD(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
, 1, usage
);
366 // e.g. result of first 8x32bit integer gather for 16bit components
367 // 256i - 0 1 2 3 4 5 6 7
368 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
371 // if we have at least one component out of x or y to fetch
372 if (info
.numComps
> 2)
374 // offset base to the next components(zw) in the vertex to gather
375 pSrcBase
= OFFSET_TO_NEXT_COMPONENT(pSrcBase
, C((intptr_t)4));
378 GATHERDD(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
, 1, usage
);
379 // e.g. result of second 8x32bit integer gather for 16bit components
380 // 256i - 0 1 2 3 4 5 6 7
381 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
386 vGatherResult
[1] = vGatherMaskedVal
;
389 // Shuffle gathered components into place, each row is a component
390 Shuffle16bpcGather4(info
, vGatherResult
, vGatherComponents
, bPackedOutput
);
396 for (uint32_t i
= 0; i
< 4; ++i
)
398 vGatherComponents
[i
] = VIMMED1((int)info
.defaults
[i
]);
401 for (uint32_t i
= 0; i
< info
.numComps
; i
++)
403 uint32_t swizzleIndex
= info
.swizzle
[i
];
405 // Gather a SIMD of components
406 vGatherComponents
[swizzleIndex
] = GATHERDD(
407 vGatherComponents
[swizzleIndex
], pSrcBase
, byteOffsets
, vMask
, 1, usage
);
409 // offset base to the next component to gather
410 pSrcBase
= OFFSET_TO_NEXT_COMPONENT(pSrcBase
, C((intptr_t)4));
415 SWR_INVALID("unsupported format");
420 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO
& info
,
421 Value
* vGatherInput
[2],
422 Value
* vGatherOutput
[4],
426 Type
* vGatherTy
= VectorType::get(IntegerType::getInt32Ty(JM()->mContext
), mVWidth
);
427 Type
* v32x8Ty
= VectorType::get(mInt8Ty
, mVWidth
* 4); // vwidth is units of 32 bits
429 // input could either be float or int vector; do shuffle work in int
430 vGatherInput
[0] = BITCAST(vGatherInput
[0], mSimdInt32Ty
);
431 vGatherInput
[1] = BITCAST(vGatherInput
[1], mSimdInt32Ty
);
435 Type
* v128bitTy
= VectorType::get(IntegerType::getIntNTy(JM()->mContext
, 128),
436 mVWidth
/ 4); // vwidth is units of 32 bits
439 Value
* vConstMask
= C
<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
440 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
442 BITCAST(PSHUFB(BITCAST(vGatherInput
[0], v32x8Ty
), vConstMask
), vGatherTy
);
443 // after pshufb: group components together in each 128bit lane
444 // 256i - 0 1 2 3 4 5 6 7
445 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
448 BITCAST(VPERMD(vShufResult
, C
<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy
);
449 // after PERMD: move and pack xy components into each 128bit lane
450 // 256i - 0 1 2 3 4 5 6 7
451 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
453 // do the same for zw components
454 Value
* vi128ZW
= nullptr;
455 if (info
.numComps
> 2)
458 BITCAST(PSHUFB(BITCAST(vGatherInput
[1], v32x8Ty
), vConstMask
), vGatherTy
);
460 BITCAST(VPERMD(vShufResult
, C
<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy
);
463 for (uint32_t i
= 0; i
< 4; i
++)
465 uint32_t swizzleIndex
= info
.swizzle
[i
];
466 // todo: fixed for packed
467 Value
* vGatherMaskedVal
= VIMMED1((int32_t)(info
.defaults
[i
]));
468 if (i
>= info
.numComps
)
470 // set the default component val
471 vGatherOutput
[swizzleIndex
] = vGatherMaskedVal
;
475 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
476 uint32_t lane
= ((i
== 0) || (i
== 2)) ? 0 : 1;
477 // if x or y, use vi128XY permute result, else use vi128ZW
478 Value
* selectedPermute
= (i
< 2) ? vi128XY
: vi128ZW
;
480 // extract packed component 128 bit lanes
481 vGatherOutput
[swizzleIndex
] = VEXTRACT(selectedPermute
, C(lane
));
486 // pshufb masks for each component
487 Value
* vConstMask
[2];
489 vConstMask
[0] = C
<char>({
490 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
491 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
495 vConstMask
[1] = C
<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
496 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
498 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
500 for (uint32_t i
= 0; i
< 4; ++i
)
502 vGatherOutput
[i
] = VIMMED1((int32_t)info
.defaults
[i
]);
505 for (uint32_t i
= 0; i
< info
.numComps
; i
++)
507 uint32_t swizzleIndex
= info
.swizzle
[i
];
509 // select correct constMask for x/z or y/w pshufb
510 uint32_t selectedMask
= ((i
== 0) || (i
== 2)) ? 0 : 1;
511 // if x or y, use vi128XY permute result, else use vi128ZW
512 uint32_t selectedGather
= (i
< 2) ? 0 : 1;
514 vGatherOutput
[swizzleIndex
] =
515 BITCAST(PSHUFB(BITCAST(vGatherInput
[selectedGather
], v32x8Ty
),
516 vConstMask
[selectedMask
]),
518 // after pshufb mask for x channel; z uses the same shuffle from the second gather
519 // 256i - 0 1 2 3 4 5 6 7
520 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
525 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO
& info
,
527 Value
* vGatherOutput
[],
531 Type
* vGatherTy
= VectorType::get(IntegerType::getInt32Ty(JM()->mContext
), mVWidth
);
532 Type
* v32x8Ty
= VectorType::get(mInt8Ty
, mVWidth
* 4); // vwidth is units of 32 bits
536 Type
* v128Ty
= VectorType::get(IntegerType::getIntNTy(JM()->mContext
, 128),
537 mVWidth
/ 4); // vwidth is units of 32 bits
539 Value
* vConstMask
= C
<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
540 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
542 BITCAST(PSHUFB(BITCAST(vGatherInput
, v32x8Ty
), vConstMask
), vGatherTy
);
543 // after pshufb: group components together in each 128bit lane
544 // 256i - 0 1 2 3 4 5 6 7
545 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
548 BITCAST(VPERMD(vShufResult
, C
<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty
);
549 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
550 // 256i - 0 1 2 3 4 5 6 7
551 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
553 // do the same for zw components
554 Value
* vi128ZW
= nullptr;
555 if (info
.numComps
> 2)
558 BITCAST(VPERMD(vShufResult
, C
<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty
);
561 // sign extend all enabled components. If we have a fill vVertexElements, output to
562 // current simdvertex
563 for (uint32_t i
= 0; i
< 4; i
++)
565 uint32_t swizzleIndex
= info
.swizzle
[i
];
566 // todo: fix for packed
567 Value
* vGatherMaskedVal
= VIMMED1((int32_t)(info
.defaults
[i
]));
568 if (i
>= info
.numComps
)
570 // set the default component val
571 vGatherOutput
[swizzleIndex
] = vGatherMaskedVal
;
575 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
576 uint32_t lane
= ((i
== 0) || (i
== 2)) ? 0 : 1;
577 // if x or y, use vi128XY permute result, else use vi128ZW
578 Value
* selectedPermute
= (i
< 2) ? vi128XY
: vi128ZW
;
581 vGatherOutput
[swizzleIndex
] = VEXTRACT(selectedPermute
, C(lane
));
587 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
589 for (uint32_t i
= 0; i
< 4; ++i
)
591 vGatherOutput
[i
] = VIMMED1((int32_t)info
.defaults
[i
]);
594 for (uint32_t i
= 0; i
< info
.numComps
; i
++)
596 uint32_t swizzleIndex
= info
.swizzle
[i
];
598 // pshufb masks for each component
605 C
<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
606 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
611 C
<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
612 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
617 C
<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
618 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
623 C
<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
624 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
627 vConstMask
= nullptr;
631 vGatherOutput
[swizzleIndex
] =
632 BITCAST(PSHUFB(BITCAST(vGatherInput
, v32x8Ty
), vConstMask
), vGatherTy
);
633 // after pshufb for x channel
634 // 256i - 0 1 2 3 4 5 6 7
635 // x000 x000 x000 x000 x000 x000 x000 x000
640 //////////////////////////////////////////////////////////////////////////
641 /// @brief emulates a scatter operation.
642 /// @param pDst - pointer to destination
643 /// @param vSrc - vector of src data to scatter
644 /// @param vOffsets - vector of byte offsets from pDst
645 /// @param vMask - mask of valid lanes
646 void Builder::SCATTERPS(
647 Value
* pDst
, Value
* vSrc
, Value
* vOffsets
, Value
* vMask
, JIT_MEM_CLIENT usage
)
649 AssertMemoryUsageParams(pDst
, usage
);
653 while(Index = BitScanForward(mask))
654 srcElem = srcVector[Index]
655 offsetElem = offsetVector[Index]
656 *(pDst + offsetElem) = srcElem
657 Update mask (&= ~(1<<Index)
661 BasicBlock
* pCurBB
= IRB()->GetInsertBlock();
662 Function
* pFunc
= pCurBB
->getParent();
663 Type
* pSrcTy
= vSrc
->getType()->getVectorElementType();
665 // Store vectors on stack
666 if (pScatterStackSrc
== nullptr)
668 // Save off stack allocations and reuse per scatter. Significantly reduces stack
669 // requirements for shaders with a lot of scatters.
670 pScatterStackSrc
= CreateEntryAlloca(pFunc
, mSimdInt64Ty
);
671 pScatterStackOffsets
= CreateEntryAlloca(pFunc
, mSimdInt32Ty
);
674 Value
* pSrcArrayPtr
= BITCAST(pScatterStackSrc
, PointerType::get(vSrc
->getType(), 0));
675 Value
* pOffsetsArrayPtr
= pScatterStackOffsets
;
676 STORE(vSrc
, pSrcArrayPtr
);
677 STORE(vOffsets
, pOffsetsArrayPtr
);
679 // Cast to pointers for random access
680 pSrcArrayPtr
= POINTER_CAST(pSrcArrayPtr
, PointerType::get(pSrcTy
, 0));
681 pOffsetsArrayPtr
= POINTER_CAST(pOffsetsArrayPtr
, PointerType::get(mInt32Ty
, 0));
683 Value
* pMask
= VMOVMSK(vMask
);
685 // Setup loop basic block
686 BasicBlock
* pLoop
= BasicBlock::Create(mpJitMgr
->mContext
, "Scatter_Loop", pFunc
);
688 // compute first set bit
689 Value
* pIndex
= CTTZ(pMask
, C(false));
691 Value
* pIsUndef
= ICMP_EQ(pIndex
, C(32));
693 // Split current block or create new one if building inline
694 BasicBlock
* pPostLoop
;
695 if (pCurBB
->getTerminator())
697 pPostLoop
= pCurBB
->splitBasicBlock(cast
<Instruction
>(pIsUndef
)->getNextNode());
699 // Remove unconditional jump created by splitBasicBlock
700 pCurBB
->getTerminator()->eraseFromParent();
702 // Add terminator to end of original block
703 IRB()->SetInsertPoint(pCurBB
);
705 // Add conditional branch
706 COND_BR(pIsUndef
, pPostLoop
, pLoop
);
710 pPostLoop
= BasicBlock::Create(mpJitMgr
->mContext
, "PostScatter_Loop", pFunc
);
712 // Add conditional branch
713 COND_BR(pIsUndef
, pPostLoop
, pLoop
);
716 // Add loop basic block contents
717 IRB()->SetInsertPoint(pLoop
);
718 PHINode
* pIndexPhi
= PHI(mInt32Ty
, 2);
719 PHINode
* pMaskPhi
= PHI(mInt32Ty
, 2);
721 pIndexPhi
->addIncoming(pIndex
, pCurBB
);
722 pMaskPhi
->addIncoming(pMask
, pCurBB
);
724 // Extract elements for this index
725 Value
* pSrcElem
= LOADV(pSrcArrayPtr
, {pIndexPhi
});
726 Value
* pOffsetElem
= LOADV(pOffsetsArrayPtr
, {pIndexPhi
});
728 // GEP to this offset in dst
729 Value
* pCurDst
= GEP(pDst
, pOffsetElem
, mInt8PtrTy
);
730 pCurDst
= POINTER_CAST(pCurDst
, PointerType::get(pSrcTy
, 0));
731 STORE(pSrcElem
, pCurDst
);
734 Value
* pNewMask
= AND(pMaskPhi
, NOT(SHL(C(1), pIndexPhi
)));
737 Value
* pNewIndex
= CTTZ(pNewMask
, C(false));
739 pIsUndef
= ICMP_EQ(pNewIndex
, C(32));
740 COND_BR(pIsUndef
, pPostLoop
, pLoop
);
743 pIndexPhi
->addIncoming(pNewIndex
, pLoop
);
744 pMaskPhi
->addIncoming(pNewMask
, pLoop
);
746 // Move builder to beginning of post loop
747 IRB()->SetInsertPoint(pPostLoop
, pPostLoop
->begin());
749 } // namespace SwrJit