1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
23 * @file builder_misc.cpp
25 * @brief Implementation for miscellaneous builder functions
29 ******************************************************************************/
30 #include "jit_pch.hpp"
32 #include "common/rdtsc_buckets.h"
39 void Builder::AssertMemoryUsageParams(Value
* ptr
, JIT_MEM_CLIENT usage
)
41 SWR_ASSERT(ptr
->getType() != mInt64Ty
, "Address appears to be GFX access. Requires translation through BuilderGfxMem.");
44 Value
*Builder::GEP(Value
*Ptr
, Value
*Idx
, Type
*Ty
, const Twine
&Name
)
46 return IRB()->CreateGEP(Ptr
, Idx
, Name
);
49 Value
*Builder::GEP(Type
*Ty
, Value
*Ptr
, Value
*Idx
, const Twine
&Name
)
51 return IRB()->CreateGEP(Ty
, Ptr
, Idx
, Name
);
54 Value
*Builder::GEP(Value
* ptr
, const std::initializer_list
<Value
*> &indexList
, Type
*Ty
)
56 std::vector
<Value
*> indices
;
57 for (auto i
: indexList
)
59 return GEPA(ptr
, indices
);
62 Value
*Builder::GEP(Value
* ptr
, const std::initializer_list
<uint32_t> &indexList
, Type
*Ty
)
64 std::vector
<Value
*> indices
;
65 for (auto i
: indexList
)
66 indices
.push_back(C(i
));
67 return GEPA(ptr
, indices
);
70 Value
*Builder::GEPA(Value
*Ptr
, ArrayRef
<Value
*> IdxList
, const Twine
&Name
)
72 return IRB()->CreateGEP(Ptr
, IdxList
, Name
);
75 Value
*Builder::GEPA(Type
*Ty
, Value
*Ptr
, ArrayRef
<Value
*> IdxList
, const Twine
&Name
)
77 return IRB()->CreateGEP(Ty
, Ptr
, IdxList
, Name
);
80 Value
*Builder::IN_BOUNDS_GEP(Value
* ptr
, const std::initializer_list
<Value
*> &indexList
)
82 std::vector
<Value
*> indices
;
83 for (auto i
: indexList
)
85 return IN_BOUNDS_GEP(ptr
, indices
);
88 Value
*Builder::IN_BOUNDS_GEP(Value
* ptr
, const std::initializer_list
<uint32_t> &indexList
)
90 std::vector
<Value
*> indices
;
91 for (auto i
: indexList
)
92 indices
.push_back(C(i
));
93 return IN_BOUNDS_GEP(ptr
, indices
);
96 LoadInst
* Builder::LOAD(Value
*Ptr
, const char *Name
, Type
*Ty
, JIT_MEM_CLIENT usage
)
98 AssertMemoryUsageParams(Ptr
, usage
);
99 return IRB()->CreateLoad(Ptr
, Name
);
102 LoadInst
* Builder::LOAD(Value
*Ptr
, const Twine
&Name
, Type
*Ty
, JIT_MEM_CLIENT usage
)
104 AssertMemoryUsageParams(Ptr
, usage
);
105 return IRB()->CreateLoad(Ptr
, Name
);
108 LoadInst
* Builder::LOAD(Type
*Ty
, Value
*Ptr
, const Twine
&Name
, JIT_MEM_CLIENT usage
)
110 AssertMemoryUsageParams(Ptr
, usage
);
111 return IRB()->CreateLoad(Ty
, Ptr
, Name
);
114 LoadInst
* Builder::LOAD(Value
*Ptr
, bool isVolatile
, const Twine
&Name
, Type
*Ty
, JIT_MEM_CLIENT usage
)
116 AssertMemoryUsageParams(Ptr
, usage
);
117 return IRB()->CreateLoad(Ptr
, isVolatile
, Name
);
120 LoadInst
*Builder::LOAD(Value
*basePtr
, const std::initializer_list
<uint32_t> &indices
, const llvm::Twine
& name
, Type
*Ty
, JIT_MEM_CLIENT usage
)
122 std::vector
<Value
*> valIndices
;
123 for (auto i
: indices
)
124 valIndices
.push_back(C(i
));
125 return Builder::LOAD(GEPA(basePtr
, valIndices
), name
);
128 LoadInst
*Builder::LOADV(Value
*basePtr
, const std::initializer_list
<Value
*> &indices
, const llvm::Twine
& name
)
130 std::vector
<Value
*> valIndices
;
131 for (auto i
: indices
)
132 valIndices
.push_back(i
);
133 return LOAD(GEPA(basePtr
, valIndices
), name
);
136 StoreInst
*Builder::STORE(Value
*val
, Value
*basePtr
, const std::initializer_list
<uint32_t> &indices
)
138 std::vector
<Value
*> valIndices
;
139 for (auto i
: indices
)
140 valIndices
.push_back(C(i
));
141 return STORE(val
, GEPA(basePtr
, valIndices
));
144 StoreInst
*Builder::STOREV(Value
*val
, Value
*basePtr
, const std::initializer_list
<Value
*> &indices
)
146 std::vector
<Value
*> valIndices
;
147 for (auto i
: indices
)
148 valIndices
.push_back(i
);
149 return STORE(val
, GEPA(basePtr
, valIndices
));
152 Value
* Builder::OFFSET_TO_NEXT_COMPONENT(Value
* base
, Constant
*offset
)
154 return GEP(base
, offset
);
157 Value
* Builder::MEM_ADD(Value
* i32Incr
, Value
* basePtr
, const std::initializer_list
<uint32_t> &indices
, const llvm::Twine
& name
)
159 Value
* i32Value
= LOAD(GEP(basePtr
, indices
), name
);
160 Value
* i32Result
= ADD(i32Value
, i32Incr
);
161 return STORE(i32Result
, GEP(basePtr
, indices
));
164 //////////////////////////////////////////////////////////////////////////
165 /// @brief Generate a masked gather operation in LLVM IR. If not
166 /// supported on the underlying platform, emulate it with loads
167 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
168 /// @param pBase - Int8* base VB address pointer value
169 /// @param vIndices - SIMD wide value of VB byte offsets
170 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
171 /// @param scale - value to scale indices by
172 Value
*Builder::GATHERPS(Value
*vSrc
, Value
*pBase
, Value
*vIndices
, Value
*vMask
, uint8_t scale
, JIT_MEM_CLIENT usage
)
174 AssertMemoryUsageParams(pBase
, usage
);
176 return VGATHERPS(vSrc
, pBase
, vIndices
, vMask
, C(scale
));
179 //////////////////////////////////////////////////////////////////////////
180 /// @brief Generate a masked gather operation in LLVM IR. If not
181 /// supported on the underlying platform, emulate it with loads
182 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
183 /// @param pBase - Int8* base VB address pointer value
184 /// @param vIndices - SIMD wide value of VB byte offsets
185 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
186 /// @param scale - value to scale indices by
187 Value
*Builder::GATHERDD(Value
* vSrc
, Value
* pBase
, Value
* vIndices
, Value
* vMask
, uint8_t scale
, JIT_MEM_CLIENT usage
)
189 AssertMemoryUsageParams(pBase
, usage
);
191 return VGATHERDD(vSrc
, pBase
, vIndices
, vMask
, C(scale
));
194 //////////////////////////////////////////////////////////////////////////
195 /// @brief Generate a masked gather operation in LLVM IR. If not
196 /// supported on the underlying platform, emulate it with loads
197 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
198 /// @param pBase - Int8* base VB address pointer value
199 /// @param vIndices - SIMD wide value of VB byte offsets
200 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
201 /// @param scale - value to scale indices by
202 Value
*Builder::GATHERPD(Value
* vSrc
, Value
* pBase
, Value
* vIndices
, Value
* vMask
, uint8_t scale
)
204 return VGATHERPD(vSrc
, pBase
, vIndices
, vMask
, C(scale
));
207 //////////////////////////////////////////////////////////////////////////
208 /// @brief Alternative masked gather where source is a vector of pointers
209 /// @param pVecSrcPtr - SIMD wide vector of pointers
210 /// @param pVecMask - SIMD active lanes
211 /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive
212 Value
* Builder::GATHER_PTR(Value
* pVecSrcPtr
, Value
* pVecMask
, Value
* pVecPassthru
)
214 return MASKED_GATHER(pVecSrcPtr
, 4, pVecMask
, pVecPassthru
);
217 void Builder::Gather4(const SWR_FORMAT format
, Value
* pSrcBase
, Value
* byteOffsets
,
218 Value
* mask
, Value
* vGatherComponents
[], bool bPackedOutput
, JIT_MEM_CLIENT usage
)
220 const SWR_FORMAT_INFO
&info
= GetFormatInfo(format
);
221 if (info
.type
[0] == SWR_TYPE_FLOAT
&& info
.bpc
[0] == 32)
223 GATHER4PS(info
, pSrcBase
, byteOffsets
, mask
, vGatherComponents
, bPackedOutput
, usage
);
227 GATHER4DD(info
, pSrcBase
, byteOffsets
, mask
, vGatherComponents
, bPackedOutput
, usage
);
231 void Builder::GATHER4PS(const SWR_FORMAT_INFO
&info
, Value
* pSrcBase
, Value
* byteOffsets
,
232 Value
* vMask
, Value
* vGatherComponents
[], bool bPackedOutput
, JIT_MEM_CLIENT usage
)
234 switch (info
.bpp
/ info
.numComps
)
238 Value
* vGatherResult
[2];
240 // TODO: vGatherMaskedVal
241 Value
* vGatherMaskedVal
= VIMMED1((float)0);
243 // always have at least one component out of x or y to fetch
245 vGatherResult
[0] = GATHERPS(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
, 1, usage
);
246 // e.g. result of first 8x32bit integer gather for 16bit components
247 // 256i - 0 1 2 3 4 5 6 7
248 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
251 // if we have at least one component out of x or y to fetch
252 if (info
.numComps
> 2)
254 // offset base to the next components(zw) in the vertex to gather
255 pSrcBase
= OFFSET_TO_NEXT_COMPONENT(pSrcBase
, C((intptr_t)4));
257 vGatherResult
[1] = GATHERPS(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
, 1, usage
);
258 // e.g. result of second 8x32bit integer gather for 16bit components
259 // 256i - 0 1 2 3 4 5 6 7
260 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
265 vGatherResult
[1] = vGatherMaskedVal
;
268 // Shuffle gathered components into place, each row is a component
269 Shuffle16bpcGather4(info
, vGatherResult
, vGatherComponents
, bPackedOutput
);
275 for (uint32_t i
= 0; i
< 4; ++i
)
277 vGatherComponents
[i
] = VIMMED1(*(float*)&info
.defaults
[i
]);
280 for (uint32_t i
= 0; i
< info
.numComps
; i
++)
282 uint32_t swizzleIndex
= info
.swizzle
[i
];
284 // Gather a SIMD of components
285 vGatherComponents
[swizzleIndex
] = GATHERPS(vGatherComponents
[swizzleIndex
], pSrcBase
, byteOffsets
, vMask
, 1, usage
);
287 // offset base to the next component to gather
288 pSrcBase
= OFFSET_TO_NEXT_COMPONENT(pSrcBase
, C((intptr_t)4));
293 SWR_INVALID("Invalid float format");
298 void Builder::GATHER4DD(const SWR_FORMAT_INFO
&info
, Value
* pSrcBase
, Value
* byteOffsets
,
299 Value
* vMask
, Value
* vGatherComponents
[], bool bPackedOutput
, JIT_MEM_CLIENT usage
)
301 switch (info
.bpp
/ info
.numComps
)
305 Value
* vGatherMaskedVal
= VIMMED1((int32_t)0);
306 Value
* vGatherResult
= GATHERDD(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
, 1, usage
);
307 // e.g. result of an 8x32bit integer gather for 8bit components
308 // 256i - 0 1 2 3 4 5 6 7
309 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
311 Shuffle8bpcGather4(info
, vGatherResult
, vGatherComponents
, bPackedOutput
);
316 Value
* vGatherResult
[2];
318 // TODO: vGatherMaskedVal
319 Value
* vGatherMaskedVal
= VIMMED1((int32_t)0);
321 // always have at least one component out of x or y to fetch
323 vGatherResult
[0] = GATHERDD(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
, 1, usage
);
324 // e.g. result of first 8x32bit integer gather for 16bit components
325 // 256i - 0 1 2 3 4 5 6 7
326 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
329 // if we have at least one component out of x or y to fetch
330 if (info
.numComps
> 2)
332 // offset base to the next components(zw) in the vertex to gather
333 pSrcBase
= OFFSET_TO_NEXT_COMPONENT(pSrcBase
, C((intptr_t)4));
335 vGatherResult
[1] = GATHERDD(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
, 1, usage
);
336 // e.g. result of second 8x32bit integer gather for 16bit components
337 // 256i - 0 1 2 3 4 5 6 7
338 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
343 vGatherResult
[1] = vGatherMaskedVal
;
346 // Shuffle gathered components into place, each row is a component
347 Shuffle16bpcGather4(info
, vGatherResult
, vGatherComponents
, bPackedOutput
);
354 for (uint32_t i
= 0; i
< 4; ++i
)
356 vGatherComponents
[i
] = VIMMED1((int)info
.defaults
[i
]);
359 for (uint32_t i
= 0; i
< info
.numComps
; i
++)
361 uint32_t swizzleIndex
= info
.swizzle
[i
];
363 // Gather a SIMD of components
364 vGatherComponents
[swizzleIndex
] = GATHERDD(vGatherComponents
[swizzleIndex
], pSrcBase
, byteOffsets
, vMask
, 1, usage
);
366 // offset base to the next component to gather
367 pSrcBase
= OFFSET_TO_NEXT_COMPONENT(pSrcBase
, C((intptr_t)4));
372 SWR_INVALID("unsupported format");
377 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO
&info
, Value
* vGatherInput
[2], Value
* vGatherOutput
[4], bool bPackedOutput
)
380 Type
* vGatherTy
= VectorType::get(IntegerType::getInt32Ty(JM()->mContext
), mVWidth
);
381 Type
* v32x8Ty
= VectorType::get(mInt8Ty
, mVWidth
* 4); // vwidth is units of 32 bits
383 // input could either be float or int vector; do shuffle work in int
384 vGatherInput
[0] = BITCAST(vGatherInput
[0], mSimdInt32Ty
);
385 vGatherInput
[1] = BITCAST(vGatherInput
[1], mSimdInt32Ty
);
389 Type
* v128bitTy
= VectorType::get(IntegerType::getIntNTy(JM()->mContext
, 128), mVWidth
/ 4); // vwidth is units of 32 bits
392 Value
* vConstMask
= C
<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
393 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
394 Value
* vShufResult
= BITCAST(PSHUFB(BITCAST(vGatherInput
[0], v32x8Ty
), vConstMask
), vGatherTy
);
395 // after pshufb: group components together in each 128bit lane
396 // 256i - 0 1 2 3 4 5 6 7
397 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
399 Value
* vi128XY
= BITCAST(VPERMD(vShufResult
, C
<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy
);
400 // after PERMD: move and pack xy components into each 128bit lane
401 // 256i - 0 1 2 3 4 5 6 7
402 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
404 // do the same for zw components
405 Value
* vi128ZW
= nullptr;
406 if (info
.numComps
> 2)
408 Value
* vShufResult
= BITCAST(PSHUFB(BITCAST(vGatherInput
[1], v32x8Ty
), vConstMask
), vGatherTy
);
409 vi128ZW
= BITCAST(VPERMD(vShufResult
, C
<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy
);
412 for (uint32_t i
= 0; i
< 4; i
++)
414 uint32_t swizzleIndex
= info
.swizzle
[i
];
415 // todo: fixed for packed
416 Value
* vGatherMaskedVal
= VIMMED1((int32_t)(info
.defaults
[i
]));
417 if (i
>= info
.numComps
)
419 // set the default component val
420 vGatherOutput
[swizzleIndex
] = vGatherMaskedVal
;
424 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
425 uint32_t lane
= ((i
== 0) || (i
== 2)) ? 0 : 1;
426 // if x or y, use vi128XY permute result, else use vi128ZW
427 Value
* selectedPermute
= (i
< 2) ? vi128XY
: vi128ZW
;
429 // extract packed component 128 bit lanes
430 vGatherOutput
[swizzleIndex
] = VEXTRACT(selectedPermute
, C(lane
));
436 // pshufb masks for each component
437 Value
* vConstMask
[2];
439 vConstMask
[0] = C
<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
440 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
443 vConstMask
[1] = C
<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
444 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
447 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
449 for (uint32_t i
= 0; i
< 4; ++i
)
451 vGatherOutput
[i
] = VIMMED1((int32_t)info
.defaults
[i
]);
454 for (uint32_t i
= 0; i
< info
.numComps
; i
++)
456 uint32_t swizzleIndex
= info
.swizzle
[i
];
458 // select correct constMask for x/z or y/w pshufb
459 uint32_t selectedMask
= ((i
== 0) || (i
== 2)) ? 0 : 1;
460 // if x or y, use vi128XY permute result, else use vi128ZW
461 uint32_t selectedGather
= (i
< 2) ? 0 : 1;
463 vGatherOutput
[swizzleIndex
] = BITCAST(PSHUFB(BITCAST(vGatherInput
[selectedGather
], v32x8Ty
), vConstMask
[selectedMask
]), vGatherTy
);
464 // after pshufb mask for x channel; z uses the same shuffle from the second gather
465 // 256i - 0 1 2 3 4 5 6 7
466 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
471 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO
&info
, Value
* vGatherInput
, Value
* vGatherOutput
[], bool bPackedOutput
)
474 Type
* vGatherTy
= VectorType::get(IntegerType::getInt32Ty(JM()->mContext
), mVWidth
);
475 Type
* v32x8Ty
= VectorType::get(mInt8Ty
, mVWidth
* 4); // vwidth is units of 32 bits
479 Type
* v128Ty
= VectorType::get(IntegerType::getIntNTy(JM()->mContext
, 128), mVWidth
/ 4); // vwidth is units of 32 bits
481 Value
* vConstMask
= C
<char>({ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
482 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 });
483 Value
* vShufResult
= BITCAST(PSHUFB(BITCAST(vGatherInput
, v32x8Ty
), vConstMask
), vGatherTy
);
484 // after pshufb: group components together in each 128bit lane
485 // 256i - 0 1 2 3 4 5 6 7
486 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
488 Value
* vi128XY
= BITCAST(VPERMD(vShufResult
, C
<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty
);
489 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
490 // 256i - 0 1 2 3 4 5 6 7
491 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
493 // do the same for zw components
494 Value
* vi128ZW
= nullptr;
495 if (info
.numComps
> 2)
497 vi128ZW
= BITCAST(VPERMD(vShufResult
, C
<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty
);
500 // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
501 for (uint32_t i
= 0; i
< 4; i
++)
503 uint32_t swizzleIndex
= info
.swizzle
[i
];
504 // todo: fix for packed
505 Value
* vGatherMaskedVal
= VIMMED1((int32_t)(info
.defaults
[i
]));
506 if (i
>= info
.numComps
)
508 // set the default component val
509 vGatherOutput
[swizzleIndex
] = vGatherMaskedVal
;
513 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
514 uint32_t lane
= ((i
== 0) || (i
== 2)) ? 0 : 1;
515 // if x or y, use vi128XY permute result, else use vi128ZW
516 Value
* selectedPermute
= (i
< 2) ? vi128XY
: vi128ZW
;
519 vGatherOutput
[swizzleIndex
] = VEXTRACT(selectedPermute
, C(lane
));
524 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
526 for (uint32_t i
= 0; i
< 4; ++i
)
528 vGatherOutput
[i
] = VIMMED1((int32_t)info
.defaults
[i
]);
531 for (uint32_t i
= 0; i
< info
.numComps
; i
++) {
532 uint32_t swizzleIndex
= info
.swizzle
[i
];
534 // pshufb masks for each component
540 vConstMask
= C
<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
541 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
545 vConstMask
= C
<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
546 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
550 vConstMask
= C
<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
551 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
555 vConstMask
= C
<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
556 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
559 vConstMask
= nullptr;
563 vGatherOutput
[swizzleIndex
] = BITCAST(PSHUFB(BITCAST(vGatherInput
, v32x8Ty
), vConstMask
), vGatherTy
);
564 // after pshufb for x channel
565 // 256i - 0 1 2 3 4 5 6 7
566 // x000 x000 x000 x000 x000 x000 x000 x000
571 //////////////////////////////////////////////////////////////////////////
572 /// @brief emulates a scatter operation.
573 /// @param pDst - pointer to destination
574 /// @param vSrc - vector of src data to scatter
575 /// @param vOffsets - vector of byte offsets from pDst
576 /// @param vMask - mask of valid lanes
577 void Builder::SCATTERPS(Value
* pDst
, Value
* vSrc
, Value
* vOffsets
, Value
* vMask
)
581 while(Index = BitScanForward(mask))
582 srcElem = srcVector[Index]
583 offsetElem = offsetVector[Index]
584 *(pDst + offsetElem) = srcElem
585 Update mask (&= ~(1<<Index)
589 BasicBlock
* pCurBB
= IRB()->GetInsertBlock();
590 Function
* pFunc
= pCurBB
->getParent();
591 Type
* pSrcTy
= vSrc
->getType()->getVectorElementType();
593 // Store vectors on stack
594 if (pScatterStackSrc
== nullptr)
596 // Save off stack allocations and reuse per scatter. Significantly reduces stack
597 // requirements for shaders with a lot of scatters.
598 pScatterStackSrc
= CreateEntryAlloca(pFunc
, mSimdInt64Ty
);
599 pScatterStackOffsets
= CreateEntryAlloca(pFunc
, mSimdInt32Ty
);
602 Value
* pSrcArrayPtr
= BITCAST(pScatterStackSrc
, PointerType::get(vSrc
->getType(), 0));
603 Value
* pOffsetsArrayPtr
= pScatterStackOffsets
;
604 STORE(vSrc
, pSrcArrayPtr
);
605 STORE(vOffsets
, pOffsetsArrayPtr
);
607 // Cast to pointers for random access
608 pSrcArrayPtr
= POINTER_CAST(pSrcArrayPtr
, PointerType::get(pSrcTy
, 0));
609 pOffsetsArrayPtr
= POINTER_CAST(pOffsetsArrayPtr
, PointerType::get(mInt32Ty
, 0));
611 Value
* pMask
= VMOVMSKPS(BITCAST(vMask
, mSimdFP32Ty
));
613 // Setup loop basic block
614 BasicBlock
* pLoop
= BasicBlock::Create(mpJitMgr
->mContext
, "Scatter_Loop", pFunc
);
616 // compute first set bit
617 Value
* pIndex
= CTTZ(pMask
, C(false));
619 Value
* pIsUndef
= ICMP_EQ(pIndex
, C(32));
621 // Split current block
622 BasicBlock
* pPostLoop
= pCurBB
->splitBasicBlock(cast
<Instruction
>(pIsUndef
)->getNextNode());
624 // Remove unconditional jump created by splitBasicBlock
625 pCurBB
->getTerminator()->eraseFromParent();
627 // Add terminator to end of original block
628 IRB()->SetInsertPoint(pCurBB
);
630 // Add conditional branch
631 COND_BR(pIsUndef
, pPostLoop
, pLoop
);
633 // Add loop basic block contents
634 IRB()->SetInsertPoint(pLoop
);
635 PHINode
* pIndexPhi
= PHI(mInt32Ty
, 2);
636 PHINode
* pMaskPhi
= PHI(mInt32Ty
, 2);
638 pIndexPhi
->addIncoming(pIndex
, pCurBB
);
639 pMaskPhi
->addIncoming(pMask
, pCurBB
);
641 // Extract elements for this index
642 Value
* pSrcElem
= LOADV(pSrcArrayPtr
, { pIndexPhi
});
643 Value
* pOffsetElem
= LOADV(pOffsetsArrayPtr
, { pIndexPhi
});
645 // GEP to this offset in dst
646 Value
* pCurDst
= GEP(pDst
, pOffsetElem
);
647 pCurDst
= POINTER_CAST(pCurDst
, PointerType::get(pSrcTy
, 0));
648 STORE(pSrcElem
, pCurDst
);
651 Value
* pNewMask
= AND(pMaskPhi
, NOT(SHL(C(1), pIndexPhi
)));
654 Value
* pNewIndex
= CTTZ(pNewMask
, C(false));
656 pIsUndef
= ICMP_EQ(pNewIndex
, C(32));
657 COND_BR(pIsUndef
, pPostLoop
, pLoop
);
660 pIndexPhi
->addIncoming(pNewIndex
, pLoop
);
661 pMaskPhi
->addIncoming(pNewMask
, pLoop
);
663 // Move builder to beginning of post loop
664 IRB()->SetInsertPoint(pPostLoop
, pPostLoop
->begin());