065d7fa0afdc996e66e17d948372d63ce6275163
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * @file builder_misc.cpp
25 * @brief Implementation for miscellaneous builder functions
29 ******************************************************************************/
30 #include "jit_pch.hpp"
37 void Builder::AssertMemoryUsageParams(Value
* ptr
, MEM_CLIENT usage
)
40 ptr
->getType() != mInt64Ty
,
41 "Address appears to be GFX access. Requires translation through BuilderGfxMem.");
44 Value
* Builder::GEP(Value
* Ptr
, Value
* Idx
, Type
* Ty
, bool isReadOnly
, const Twine
& Name
)
46 return IRB()->CreateGEP(Ptr
, Idx
, Name
);
49 Value
* Builder::GEP(Type
* Ty
, Value
* Ptr
, Value
* Idx
, const Twine
& Name
)
51 return IRB()->CreateGEP(Ty
, Ptr
, Idx
, Name
);
54 Value
* Builder::GEP(Value
* ptr
, const std::initializer_list
<Value
*>& indexList
, Type
* Ty
)
56 std::vector
<Value
*> indices
;
57 for (auto i
: indexList
)
59 return GEPA(ptr
, indices
);
62 Value
* Builder::GEP(Value
* ptr
, const std::initializer_list
<uint32_t>& indexList
, Type
* Ty
)
64 std::vector
<Value
*> indices
;
65 for (auto i
: indexList
)
66 indices
.push_back(C(i
));
67 return GEPA(ptr
, indices
);
70 Value
* Builder::GEPA(Value
* Ptr
, ArrayRef
<Value
*> IdxList
, const Twine
& Name
)
72 return IRB()->CreateGEP(Ptr
, IdxList
, Name
);
75 Value
* Builder::GEPA(Type
* Ty
, Value
* Ptr
, ArrayRef
<Value
*> IdxList
, const Twine
& Name
)
77 return IRB()->CreateGEP(Ty
, Ptr
, IdxList
, Name
);
80 Value
* Builder::IN_BOUNDS_GEP(Value
* ptr
, const std::initializer_list
<Value
*>& indexList
)
82 std::vector
<Value
*> indices
;
83 for (auto i
: indexList
)
85 return IN_BOUNDS_GEP(ptr
, indices
);
88 Value
* Builder::IN_BOUNDS_GEP(Value
* ptr
, const std::initializer_list
<uint32_t>& indexList
)
90 std::vector
<Value
*> indices
;
91 for (auto i
: indexList
)
92 indices
.push_back(C(i
));
93 return IN_BOUNDS_GEP(ptr
, indices
);
96 LoadInst
* Builder::LOAD(Value
* Ptr
, const char* Name
, Type
* Ty
, MEM_CLIENT usage
)
98 AssertMemoryUsageParams(Ptr
, usage
);
99 return IRB()->CreateLoad(Ptr
, Name
);
102 LoadInst
* Builder::LOAD(Value
* Ptr
, const Twine
& Name
, Type
* Ty
, MEM_CLIENT usage
)
104 AssertMemoryUsageParams(Ptr
, usage
);
105 return IRB()->CreateLoad(Ptr
, Name
);
108 LoadInst
* Builder::LOAD(Type
* Ty
, Value
* Ptr
, const Twine
& Name
, MEM_CLIENT usage
)
110 AssertMemoryUsageParams(Ptr
, usage
);
111 return IRB()->CreateLoad(Ty
, Ptr
, Name
);
115 Builder::LOAD(Value
* Ptr
, bool isVolatile
, const Twine
& Name
, Type
* Ty
, MEM_CLIENT usage
)
117 AssertMemoryUsageParams(Ptr
, usage
);
118 return IRB()->CreateLoad(Ptr
, isVolatile
, Name
);
// Load through a constant-index GEP: every uint32_t in `indices` is wrapped
// with C(), the resulting list is passed to GEPA(basePtr, ...), and the
// computed address is handed to Builder::LOAD together with `name`.
// NOTE(review): the parameter list is not closed in this view (nothing
// follows `name,`), so trailing parameters appear to be missing here.
121 LoadInst
* Builder::LOAD(Value
* basePtr
,
122 const std::initializer_list
<uint32_t>& indices
,
123 const llvm::Twine
& name
,
127 std::vector
<Value
*> valIndices
;
128 for (auto i
: indices
)
129 valIndices
.push_back(C(i
));
130 return Builder::LOAD(GEPA(basePtr
, valIndices
), name
);
133 LoadInst
* Builder::LOADV(Value
* basePtr
,
134 const std::initializer_list
<Value
*>& indices
,
135 const llvm::Twine
& name
)
137 std::vector
<Value
*> valIndices
;
138 for (auto i
: indices
)
139 valIndices
.push_back(i
);
140 return LOAD(GEPA(basePtr
, valIndices
), name
);
// Store `val` through a constant-index GEP: every uint32_t in `indices` is
// wrapped with C(), the list is passed to GEPA(basePtr, ...), and the
// two-argument STORE(val, address) is called on the result.
// `Ty` and `usage` are not referenced in the visible body.
// NOTE(review): the return type of this definition is not visible here.
144 Builder::STORE(Value
* val
, Value
* basePtr
, const std::initializer_list
<uint32_t>& indices
, Type
* Ty
, MEM_CLIENT usage
)
146 std::vector
<Value
*> valIndices
;
147 for (auto i
: indices
)
148 valIndices
.push_back(C(i
));
149 return STORE(val
, GEPA(basePtr
, valIndices
));
// Store `val` through a GEP built from a list of index values: the indices
// are copied into a vector, passed to GEPA(basePtr, ...), and the
// two-argument STORE(val, address) is called on the result.
// NOTE(review): the return type of this definition is not visible here.
153 Builder::STOREV(Value
* val
, Value
* basePtr
, const std::initializer_list
<Value
*>& indices
)
155 std::vector
<Value
*> valIndices
;
156 for (auto i
: indices
)
157 valIndices
.push_back(i
);
158 return STORE(val
, GEPA(basePtr
, valIndices
));
161 Value
* Builder::OFFSET_TO_NEXT_COMPONENT(Value
* base
, Constant
* offset
)
163 return GEP(base
, offset
);
// Read-modify-write add: loads the value at GEP(basePtr, indices), adds
// i32Incr to it with ADD, and stores the sum back to GEP(basePtr, indices).
// The result of the STORE call is returned.
// The address GEP is emitted twice, once for the load and once for the store.
// NOTE(review): `basePtr` is used below but is not declared in the visible
// parameter list; a parameter line appears to be missing in this view.
166 Value
* Builder::MEM_ADD(Value
* i32Incr
,
168 const std::initializer_list
<uint32_t>& indices
,
169 const llvm::Twine
& name
)
171 Value
* i32Value
= LOAD(GEP(basePtr
, indices
), name
);
172 Value
* i32Result
= ADD(i32Value
, i32Incr
);
173 return STORE(i32Result
, GEP(basePtr
, indices
));
176 //////////////////////////////////////////////////////////////////////////
177 /// @brief Generate a masked gather operation in LLVM IR. If not
178 /// supported on the underlying platform, emulate it with loads
179 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
180 /// @param pBase - Int8* base VB address pointer value
181 /// @param vIndices - SIMD wide value of VB byte offsets
182 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
183 /// @param scale - value to scale indices by
// Validates pBase with AssertMemoryUsageParams, then forwards to VGATHERPS
// with `scale` wrapped as a constant via C().
// NOTE(review): only the first parameter (vSrc) is visible in this view;
// pBase, vIndices, vMask, scale and usage are used below but their
// declarations are not shown here.
184 Value
* Builder::GATHERPS(Value
* vSrc
,
191 AssertMemoryUsageParams(pBase
, usage
);
193 return VGATHERPS(vSrc
, pBase
, vIndices
, vMask
, C(scale
));
196 //////////////////////////////////////////////////////////////////////////
197 /// @brief Generate a masked gather operation in LLVM IR. If not
198 /// supported on the underlying platform, emulate it with loads
199 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
200 /// @param pBase - Int8* base VB address pointer value
201 /// @param vIndices - SIMD wide value of VB byte offsets
202 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
203 /// @param scale - value to scale indices by
// Validates pBase with AssertMemoryUsageParams, then forwards to VGATHERDD
// with `scale` wrapped as a constant via C().
// NOTE(review): only the first parameter (vSrc) is visible in this view;
// pBase, vIndices, vMask, scale and usage are used below but their
// declarations are not shown here.
204 Value
* Builder::GATHERDD(Value
* vSrc
,
211 AssertMemoryUsageParams(pBase
, usage
);
213 return VGATHERDD(vSrc
, pBase
, vIndices
, vMask
, C(scale
));
216 //////////////////////////////////////////////////////////////////////////
217 /// @brief Generate a masked gather operation in LLVM IR. If not
218 /// supported on the underlying platform, emulate it with loads
219 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
220 /// @param pBase - Int8* base VB address pointer value
221 /// @param vIndices - SIMD wide value of VB byte offsets
222 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
223 /// @param scale - value to scale indices by
225 Builder::GATHERPD(Value
* vSrc
, Value
* pBase
, Value
* vIndices
, Value
* vMask
, uint8_t scale
)
227 return VGATHERPD(vSrc
, pBase
, vIndices
, vMask
, C(scale
));
230 //////////////////////////////////////////////////////////////////////////
231 /// @brief Alternative masked gather where source is a vector of pointers
232 /// @param pVecSrcPtr - SIMD wide vector of pointers
233 /// @param pVecMask - SIMD active lanes
234 /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive
235 Value
* Builder::GATHER_PTR(Value
* pVecSrcPtr
, Value
* pVecMask
, Value
* pVecPassthru
)
237 return MASKED_GATHER(pVecSrcPtr
, AlignType(4), pVecMask
, pVecPassthru
);
240 void Builder::SCATTER_PTR(Value
* pVecDstPtr
, Value
* pVecSrc
, Value
* pVecMask
)
242 MASKED_SCATTER(pVecSrc
, pVecDstPtr
, AlignType(4), pVecMask
);
// Dispatches a 4-component gather by format: looks up the format info and
// calls GATHER4PS when component 0 is a 32-bit float; GATHER4DD is the other
// call visible below.
// NOTE(review): most of the parameter list (pSrcBase, byteOffsets, mask,
// bPackedOutput, usage) and the else keyword separating the two calls are
// not visible in this view.
245 void Builder::Gather4(const SWR_FORMAT format
,
249 Value
* vGatherComponents
[],
253 const SWR_FORMAT_INFO
& info
= GetFormatInfo(format
);
// float path is taken only for 32 bits-per-component float formats
254 if (info
.type
[0] == SWR_TYPE_FLOAT
&& info
.bpc
[0] == 32)
256 GATHER4PS(info
, pSrcBase
, byteOffsets
, mask
, vGatherComponents
, bPackedOutput
, usage
);
260 GATHER4DD(info
, pSrcBase
, byteOffsets
, mask
, vGatherComponents
, bPackedOutput
, usage
);
// Gathers up to four components per lane using GATHERPS, selecting a
// strategy from info.bpp / info.numComps (bits per component).
// Two strategies are visible below:
//   - a two-gather path (xy, then zw) whose results are rearranged by
//     Shuffle16bpcGather4;
//   - a per-component path that seeds each output with info.defaults[] and
//     issues one GATHERPS per component, stepping pSrcBase by 4 each time.
// Any other value reaches SWR_INVALID("Invalid float format").
// NOTE(review): the case labels, braces and several parameters (pSrcBase,
// byteOffsets, vMask, bPackedOutput, usage) are not visible in this view.
264 void Builder::GATHER4PS(const SWR_FORMAT_INFO
& info
,
268 Value
* vGatherComponents
[],
272 switch (info
.bpp
/ info
.numComps
)
276 Value
* vGatherResult
[2];
278 // TODO: vGatherMaskedVal
279 Value
* vGatherMaskedVal
= VIMMED1((float)0);
281 // always have at least one component out of x or y to fetch
283 vGatherResult
[0] = GATHERPS(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
, 1, usage
);
284 // e.g. result of first 8x32bit integer gather for 16bit components
285 // 256i - 0 1 2 3 4 5 6 7
286 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
289 // if we have at least one component out of z or w to fetch
290 if (info
.numComps
> 2)
292 // offset base to the next components(zw) in the vertex to gather
293 pSrcBase
= OFFSET_TO_NEXT_COMPONENT(pSrcBase
, C((intptr_t)4));
// NOTE(review): the target of this second gather is not visible here;
// presumably it is assigned to vGatherResult[1] - confirm against the file.
296 GATHERPS(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
, 1, usage
);
297 // e.g. result of second 8x32bit integer gather for 16bit components
298 // 256i - 0 1 2 3 4 5 6 7
299 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
// fallback value for the zw result (presumably the else branch of the
// numComps test above - TODO confirm)
304 vGatherResult
[1] = vGatherMaskedVal
;
307 // Shuffle gathered components into place, each row is a component
308 Shuffle16bpcGather4(info
, vGatherResult
, vGatherComponents
, bPackedOutput
);
// seed every output component with the format's default, reinterpreting the
// stored default bits as a float
314 for (uint32_t i
= 0; i
< 4; ++i
)
316 vGatherComponents
[i
] = VIMMED1(*(float*)&info
.defaults
[i
]);
319 for (uint32_t i
= 0; i
< info
.numComps
; i
++)
321 uint32_t swizzleIndex
= info
.swizzle
[i
];
323 // Gather a SIMD of components
324 vGatherComponents
[swizzleIndex
] = GATHERPS(
325 vGatherComponents
[swizzleIndex
], pSrcBase
, byteOffsets
, vMask
, 1, usage
);
327 // offset base to the next component to gather
328 pSrcBase
= OFFSET_TO_NEXT_COMPONENT(pSrcBase
, C((intptr_t)4));
333 SWR_INVALID("Invalid float format");
// Integer counterpart of GATHER4PS: gathers up to four components per lane
// using GATHERDD, selecting a strategy from info.bpp / info.numComps.
// Three strategies are visible below:
//   - a single gather rearranged by Shuffle8bpcGather4;
//   - a two-gather path (xy, then zw) rearranged by Shuffle16bpcGather4;
//   - a per-component path that seeds each output with info.defaults[] and
//     issues one GATHERDD per component, stepping pSrcBase by 4 each time.
// Any other value reaches SWR_INVALID("unsupported format").
// NOTE(review): the case labels, braces and several parameters (pSrcBase,
// byteOffsets, vMask, bPackedOutput, usage) are not visible in this view.
338 void Builder::GATHER4DD(const SWR_FORMAT_INFO
& info
,
342 Value
* vGatherComponents
[],
346 switch (info
.bpp
/ info
.numComps
)
350 Value
* vGatherMaskedVal
= VIMMED1((int32_t)0);
351 Value
* vGatherResult
=
352 GATHERDD(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
, 1, usage
);
353 // e.g. result of an 8x32bit integer gather for 8bit components
354 // 256i - 0 1 2 3 4 5 6 7
355 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
357 Shuffle8bpcGather4(info
, vGatherResult
, vGatherComponents
, bPackedOutput
);
362 Value
* vGatherResult
[2];
364 // TODO: vGatherMaskedVal
365 Value
* vGatherMaskedVal
= VIMMED1((int32_t)0);
367 // always have at least one component out of x or y to fetch
369 vGatherResult
[0] = GATHERDD(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
, 1, usage
);
370 // e.g. result of first 8x32bit integer gather for 16bit components
371 // 256i - 0 1 2 3 4 5 6 7
372 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
375 // if we have at least one component out of z or w to fetch
376 if (info
.numComps
> 2)
378 // offset base to the next components(zw) in the vertex to gather
379 pSrcBase
= OFFSET_TO_NEXT_COMPONENT(pSrcBase
, C((intptr_t)4));
// NOTE(review): the target of this second gather is not visible here;
// presumably it is assigned to vGatherResult[1] - confirm against the file.
382 GATHERDD(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
, 1, usage
);
383 // e.g. result of second 8x32bit integer gather for 16bit components
384 // 256i - 0 1 2 3 4 5 6 7
385 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
// fallback value for the zw result (presumably the else branch of the
// numComps test above - TODO confirm)
390 vGatherResult
[1] = vGatherMaskedVal
;
393 // Shuffle gathered components into place, each row is a component
394 Shuffle16bpcGather4(info
, vGatherResult
, vGatherComponents
, bPackedOutput
);
// seed every output component with the format's default converted to int
400 for (uint32_t i
= 0; i
< 4; ++i
)
402 vGatherComponents
[i
] = VIMMED1((int)info
.defaults
[i
]);
405 for (uint32_t i
= 0; i
< info
.numComps
; i
++)
407 uint32_t swizzleIndex
= info
.swizzle
[i
];
409 // Gather a SIMD of components
410 vGatherComponents
[swizzleIndex
] = GATHERDD(
411 vGatherComponents
[swizzleIndex
], pSrcBase
, byteOffsets
, vMask
, 1, usage
);
413 // offset base to the next component to gather
414 pSrcBase
= OFFSET_TO_NEXT_COMPONENT(pSrcBase
, C((intptr_t)4));
419 SWR_INVALID("unsupported format");
// Rearranges the two raw gather results of a 16-bit-per-component format
// (vGatherInput[0] = xy pairs, vGatherInput[1] = zw pairs) into one output
// vector per component in vGatherOutput[], indexed through info.swizzle[].
// Two code paths are visible below:
//   - a packed path: PSHUFB groups like components inside each 128-bit lane,
//     VPERMD packs them, and VEXTRACT picks the lane for each component;
//   - an unpacked path: each output is seeded with info.defaults[] and then
//     produced by one PSHUFB that moves the component into the low 16 bits
//     of each 32-bit element (mask bytes of -1 produce zero).
// NOTE(review): the remaining parameters, the condition that selects between
// the two paths (presumably bPackedOutput), the braces, and the variables
// receiving several BITCAST results (vShufResult, vi128XY, vi128ZW) are not
// visible in this view.
424 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO
& info
,
425 Value
* vGatherInput
[2],
426 Value
* vGatherOutput
[4],
430 Type
* vGatherTy
= VectorType::get(IntegerType::getInt32Ty(JM()->mContext
), mVWidth
);
431 Type
* v32x8Ty
= VectorType::get(mInt8Ty
, mVWidth
* 4); // vwidth is units of 32 bits
433 // input could either be float or int vector; do shuffle work in int
434 vGatherInput
[0] = BITCAST(vGatherInput
[0], mSimdInt32Ty
);
435 vGatherInput
[1] = BITCAST(vGatherInput
[1], mSimdInt32Ty
);
439 Type
* v128bitTy
= VectorType::get(IntegerType::getIntNTy(JM()->mContext
, 128),
440 mVWidth
/ 4); // vwidth is units of 32 bits
// byte shuffle that gathers the low 16 bits of every element first, then the
// high 16 bits, repeated for each 128-bit lane
443 Value
* vConstMask
= C
<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
444 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
446 BITCAST(PSHUFB(BITCAST(vGatherInput
[0], v32x8Ty
), vConstMask
), vGatherTy
);
447 // after pshufb: group components together in each 128bit lane
448 // 256i - 0 1 2 3 4 5 6 7
449 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
452 BITCAST(VPERMD(vShufResult
, C
<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy
);
453 // after PERMD: move and pack xy components into each 128bit lane
454 // 256i - 0 1 2 3 4 5 6 7
455 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
457 // do the same for zw components
458 Value
* vi128ZW
= nullptr;
459 if (info
.numComps
> 2)
462 BITCAST(PSHUFB(BITCAST(vGatherInput
[1], v32x8Ty
), vConstMask
), vGatherTy
);
464 BITCAST(VPERMD(vShufResult
, C
<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy
);
467 for (uint32_t i
= 0; i
< 4; i
++)
469 uint32_t swizzleIndex
= info
.swizzle
[i
];
470 // todo: fixed for packed
471 Value
* vGatherMaskedVal
= VIMMED1((int32_t)(info
.defaults
[i
]));
472 if (i
>= info
.numComps
)
474 // set the default component val
475 vGatherOutput
[swizzleIndex
] = vGatherMaskedVal
;
479 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
480 uint32_t lane
= ((i
== 0) || (i
== 2)) ? 0 : 1;
481 // if x or y, use vi128XY permute result, else use vi128ZW
482 Value
* selectedPermute
= (i
< 2) ? vi128XY
: vi128ZW
;
484 // extract packed component 128 bit lanes
485 vGatherOutput
[swizzleIndex
] = VEXTRACT(selectedPermute
, C(lane
));
490 // pshufb masks for each component
491 Value
* vConstMask
[2];
493 vConstMask
[0] = C
<char>({
494 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
495 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
499 vConstMask
[1] = C
<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
500 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
502 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
504 for (uint32_t i
= 0; i
< 4; ++i
)
506 vGatherOutput
[i
] = VIMMED1((int32_t)info
.defaults
[i
]);
509 for (uint32_t i
= 0; i
< info
.numComps
; i
++)
511 uint32_t swizzleIndex
= info
.swizzle
[i
];
513 // select correct constMask for x/z or y/w pshufb
514 uint32_t selectedMask
= ((i
== 0) || (i
== 2)) ? 0 : 1;
515 // if x or y, use the first gather result (vGatherInput[0]), else the second (vGatherInput[1])
516 uint32_t selectedGather
= (i
< 2) ? 0 : 1;
518 vGatherOutput
[swizzleIndex
] =
519 BITCAST(PSHUFB(BITCAST(vGatherInput
[selectedGather
], v32x8Ty
),
520 vConstMask
[selectedMask
]),
522 // after pshufb mask for x channel; z uses the same shuffle from the second gather
523 // 256i - 0 1 2 3 4 5 6 7
524 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
// Rearranges the raw gather result of an 8-bit-per-component format
// (vGatherInput holds xyzw bytes in every 32-bit element) into one output
// vector per component in vGatherOutput[], indexed through info.swizzle[].
// Two code paths are visible below:
//   - a packed path: PSHUFB groups like components inside each 128-bit lane,
//     VPERMD packs xy and zw, and VEXTRACT picks the lane for each component;
//   - an unpacked path: each output is seeded with info.defaults[] and then
//     produced by one PSHUFB that moves the component into the low byte of
//     each 32-bit element (mask bytes of -1 produce zero).
// NOTE(review): the remaining parameters, the condition that selects between
// the two paths (presumably bPackedOutput), the per-component switch that
// picks one of the four byte masks, the braces, and the variables receiving
// several BITCAST results (vShufResult, vi128XY, vi128ZW) are not visible in
// this view.
529 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO
& info
,
531 Value
* vGatherOutput
[],
535 Type
* vGatherTy
= VectorType::get(IntegerType::getInt32Ty(JM()->mContext
), mVWidth
);
536 Type
* v32x8Ty
= VectorType::get(mInt8Ty
, mVWidth
* 4); // vwidth is units of 32 bits
540 Type
* v128Ty
= VectorType::get(IntegerType::getIntNTy(JM()->mContext
, 128),
541 mVWidth
/ 4); // vwidth is units of 32 bits
// byte shuffle that collects byte 0 of every element, then byte 1, byte 2
// and byte 3, repeated for each 128-bit lane
543 Value
* vConstMask
= C
<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
544 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
546 BITCAST(PSHUFB(BITCAST(vGatherInput
, v32x8Ty
), vConstMask
), vGatherTy
);
547 // after pshufb: group components together in each 128bit lane
548 // 256i - 0 1 2 3 4 5 6 7
549 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
552 BITCAST(VPERMD(vShufResult
, C
<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty
);
553 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
554 // 256i - 0 1 2 3 4 5 6 7
555 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
557 // do the same for zw components
558 Value
* vi128ZW
= nullptr;
559 if (info
.numComps
> 2)
562 BITCAST(VPERMD(vShufResult
, C
<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty
);
565 // sign extend all enabled components. If we have a fill vVertexElements, output to
566 // current simdvertex
567 for (uint32_t i
= 0; i
< 4; i
++)
569 uint32_t swizzleIndex
= info
.swizzle
[i
];
570 // todo: fix for packed
571 Value
* vGatherMaskedVal
= VIMMED1((int32_t)(info
.defaults
[i
]));
572 if (i
>= info
.numComps
)
574 // set the default component val
575 vGatherOutput
[swizzleIndex
] = vGatherMaskedVal
;
579 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
580 uint32_t lane
= ((i
== 0) || (i
== 2)) ? 0 : 1;
581 // if x or y, use vi128XY permute result, else use vi128ZW
582 Value
* selectedPermute
= (i
< 2) ? vi128XY
: vi128ZW
;
585 vGatherOutput
[swizzleIndex
] = VEXTRACT(selectedPermute
, C(lane
));
591 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
593 for (uint32_t i
= 0; i
< 4; ++i
)
595 vGatherOutput
[i
] = VIMMED1((int32_t)info
.defaults
[i
]);
598 for (uint32_t i
= 0; i
< info
.numComps
; i
++)
600 uint32_t swizzleIndex
= info
.swizzle
[i
];
602 // pshufb masks for each component
// the four masks below select source byte 0, 1, 2 and 3 of every 32-bit
// element respectively
609 C
<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
610 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
615 C
<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
616 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
621 C
<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
622 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
627 C
<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
628 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
// no mask for an unexpected component index; caught by the assert below
631 vConstMask
= nullptr;
635 assert(vConstMask
&& "Invalid info.numComps value");
636 vGatherOutput
[swizzleIndex
] =
637 BITCAST(PSHUFB(BITCAST(vGatherInput
, v32x8Ty
), vConstMask
), vGatherTy
);
638 // after pshufb for x channel
639 // 256i - 0 1 2 3 4 5 6 7
640 // x000 x000 x000 x000 x000 x000 x000 x000
645 //////////////////////////////////////////////////////////////////////////
646 /// @brief emulates a scatter operation.
647 /// @param pDst - pointer to destination
648 /// @param vSrc - vector of src data to scatter
649 /// @param vOffsets - vector of byte offsets from pDst
650 /// @param vMask - mask of valid lanes
651 void Builder::SCATTERPS(
652 Value
* pDst
, Value
* vSrc
, Value
* vOffsets
, Value
* vMask
, MEM_CLIENT usage
)
654 AssertMemoryUsageParams(pDst
, usage
);
655 #if LLVM_VERSION_MAJOR >= 11
656 SWR_ASSERT(cast
<VectorType
>(vSrc
->getType())->getElementType()->isFloatTy());
658 SWR_ASSERT(vSrc
->getType()->getVectorElementType()->isFloatTy());
660 VSCATTERPS(pDst
, vMask
, vOffsets
, vSrc
, C(1));
665 while(Index = BitScanForward(mask))
666 srcElem = srcVector[Index]
667 offsetElem = offsetVector[Index]
668 *(pDst + offsetElem) = srcElem
669 Update mask (&= ~(1<<Index)
675 // Reference implementation kept around for reference
677 BasicBlock* pCurBB = IRB()->GetInsertBlock();
678 Function* pFunc = pCurBB->getParent();
679 Type* pSrcTy = vSrc->getType()->getVectorElementType();
681 // Store vectors on stack
682 if (pScatterStackSrc == nullptr)
684 // Save off stack allocations and reuse per scatter. Significantly reduces stack
685 // requirements for shaders with a lot of scatters.
686 pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
687 pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
690 Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
691 Value* pOffsetsArrayPtr = pScatterStackOffsets;
692 STORE(vSrc, pSrcArrayPtr);
693 STORE(vOffsets, pOffsetsArrayPtr);
695 // Cast to pointers for random access
696 pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
697 pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
699 Value* pMask = VMOVMSK(vMask);
701 // Setup loop basic block
702 BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);
704 // compute first set bit
705 Value* pIndex = CTTZ(pMask, C(false));
707 Value* pIsUndef = ICMP_EQ(pIndex, C(32));
709 // Split current block or create new one if building inline
710 BasicBlock* pPostLoop;
711 if (pCurBB->getTerminator())
713 pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
715 // Remove unconditional jump created by splitBasicBlock
716 pCurBB->getTerminator()->eraseFromParent();
718 // Add terminator to end of original block
719 IRB()->SetInsertPoint(pCurBB);
721 // Add conditional branch
722 COND_BR(pIsUndef, pPostLoop, pLoop);
726 pPostLoop = BasicBlock::Create(mpJitMgr->mContext, "PostScatter_Loop", pFunc);
728 // Add conditional branch
729 COND_BR(pIsUndef, pPostLoop, pLoop);
732 // Add loop basic block contents
733 IRB()->SetInsertPoint(pLoop);
734 PHINode* pIndexPhi = PHI(mInt32Ty, 2);
735 PHINode* pMaskPhi = PHI(mInt32Ty, 2);
737 pIndexPhi->addIncoming(pIndex, pCurBB);
738 pMaskPhi->addIncoming(pMask, pCurBB);
740 // Extract elements for this index
741 Value* pSrcElem = LOADV(pSrcArrayPtr, {pIndexPhi});
742 Value* pOffsetElem = LOADV(pOffsetsArrayPtr, {pIndexPhi});
744 // GEP to this offset in dst
745 Value* pCurDst = GEP(pDst, pOffsetElem, mInt8PtrTy);
746 pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
747 STORE(pSrcElem, pCurDst);
750 Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
753 Value* pNewIndex = CTTZ(pNewMask, C(false));
755 pIsUndef = ICMP_EQ(pNewIndex, C(32));
756 COND_BR(pIsUndef, pPostLoop, pLoop);
759 pIndexPhi->addIncoming(pNewIndex, pLoop);
760 pMaskPhi->addIncoming(pNewMask, pLoop);
762 // Move builder to beginning of post loop
763 IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
767 } // namespace SwrJit