/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder_misc.cpp
*
* @brief Implementation for miscellaneous builder functions
*
******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h" // Builder class definition; needed for the member functions below
#include "common/rdtsc_buckets.h"
Value* Builder::GEP(Value* ptr, const std::initializer_list<Value*>& indexList)
{
    std::vector<Value*> indices;
    for (auto i : indexList)
        indices.push_back(i);
    return GEPA(ptr, indices);
}

Value* Builder::GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList)
{
    std::vector<Value*> indices;
    for (auto i : indexList)
        indices.push_back(C(i));
    return GEPA(ptr, indices);
}

Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList)
{
    std::vector<Value*> indices;
    for (auto i : indexList)
        indices.push_back(i);
    return IN_BOUNDS_GEP(ptr, indices);
}

Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList)
{
    std::vector<Value*> indices;
    for (auto i : indexList)
        indices.push_back(C(i));
    return IN_BOUNDS_GEP(ptr, indices);
}

LoadInst* Builder::LOAD(Value* basePtr, const std::initializer_list<uint32_t>& indices, const llvm::Twine& name)
{
    std::vector<Value*> valIndices;
    for (auto i : indices)
        valIndices.push_back(C(i));
    return LOAD(GEPA(basePtr, valIndices), name);
}

LoadInst* Builder::LOADV(Value* basePtr, const std::initializer_list<Value*>& indices, const llvm::Twine& name)
{
    std::vector<Value*> valIndices;
    for (auto i : indices)
        valIndices.push_back(i);
    return LOAD(GEPA(basePtr, valIndices), name);
}

StoreInst* Builder::STORE(Value* val, Value* basePtr, const std::initializer_list<uint32_t>& indices)
{
    std::vector<Value*> valIndices;
    for (auto i : indices)
        valIndices.push_back(C(i));
    return STORE(val, GEPA(basePtr, valIndices));
}

StoreInst* Builder::STOREV(Value* val, Value* basePtr, const std::initializer_list<Value*>& indices)
{
    std::vector<Value*> valIndices;
    for (auto i : indices)
        valIndices.push_back(i);
    return STORE(val, GEPA(basePtr, valIndices));
}

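// Illustrative usage (hypothetical pointer name, not from this file): the
// index-list overloads fold the GEP into the memory access, so
//     LOAD(pVertex, { 0, 1 })
// emits the same IR as
//     LOAD(GEP(pVertex, { 0, 1 }))
// i.e. a constant-index GEP followed by a scalar load.
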
//////////////////////////////////////////////////////////////////////////
/// @brief Generate an i32 masked load operation in LLVM IR. If not
/// supported on the underlying platform, emulate it with a float masked load
/// @param src - base address pointer for the load
/// @param mask - SIMD wide mask that controls whether to access memory or load 0
Value* Builder::MASKLOADD(Value* src, Value* mask)
{
    Value* vResult;
    // use avx2 masked load instruction if available
    if (JM()->mArch.AVX2())
    {
        Function* func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
        vResult = CALL(func, { src, mask });
    }
    else
    {
        // maskload intrinsic expects integer mask operand in llvm >= 3.8
#if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
        mask = BITCAST(mask, VectorType::get(mInt32Ty, mVWidth));
#else
        mask = BITCAST(mask, VectorType::get(mFP32Ty, mVWidth));
#endif
        Function* func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskload_ps_256);
        vResult = BITCAST(CALL(func, { src, mask }), VectorType::get(mInt32Ty, mVWidth));
    }
    return vResult;
}

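// Illustrative per-lane semantics of MASKLOADD (both the AVX2 integer form
// and the AVX float fallback), assuming the x86 maskload convention that a
// lane is loaded only when the sign bit of its mask element is set:
//     result[i] = (mask[i] >> 31) ? ((const int32_t*)src)[i] : 0;
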
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a masked gather operation in LLVM IR. If not
/// supported on the underlying platform, emulate it with loads
/// @param vSrc - SIMD wide value that will be loaded if mask is invalid
/// @param pBase - Int8* base VB address pointer value
/// @param vIndices - SIMD wide value of VB byte offsets
/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
/// @param scale - value to scale indices by
Value* Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
{
    Value* vGather;
    Value* pBasePtr = INT_TO_PTR(pBase, PointerType::get(mInt8Ty, 0));

    // use avx2 gather instruction if available
    if (JM()->mArch.AVX2())
    {
        // force mask to <N x float>, required by vgather
        Value* mask = BITCAST(VMASK(vMask), mSimdFP32Ty);

        vGather = VGATHERPS(vSrc, pBasePtr, vIndices, mask, C(scale));
    }
    else
    {
        Value* pStack = STACKSAVE();

        // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
        Value* vSrcPtr = ALLOCA(vSrc->getType());
        STORE(vSrc, vSrcPtr);

        vGather = VUNDEF_F();
        Value* vScaleVec = VIMMED1((uint32_t)scale);
        Value* vOffsets = MUL(vIndices, vScaleVec);
        for (uint32_t i = 0; i < mVWidth; ++i)
        {
            // single component byte index
            Value* offset = VEXTRACT(vOffsets, C(i));
            // byte pointer to component
            Value* loadAddress = GEP(pBasePtr, offset);
            loadAddress = BITCAST(loadAddress, PointerType::get(mFP32Ty, 0));
            // pointer to the value to load if we're masking off a component
            Value* maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
            Value* selMask = VEXTRACT(vMask, C(i));
            // switch in a safe address to load if we're trying to access a masked-off vertex
            Value* validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
            Value* val = LOAD(validAddress);
            vGather = VINSERT(vGather, val, C(i));
        }

        STACKRESTORE(pStack);
    }

    return vGather;
}

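// Illustrative scalar equivalent of the gather above (not emitted code),
// assuming byte offsets and 32-bit float elements; laneEnabled() is a
// hypothetical helper standing in for the per-lane mask test:
//     for (uint32_t i = 0; i < simdWidth; ++i)
//         result[i] = laneEnabled(vMask, i)
//             ? *(const float*)(pBase + vIndices[i] * scale)
//             : vSrc[i];
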
Value* Builder::GATHERPS_16(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
{
    Value* vGather = VUNDEF_F_16();

    // use AVX512F gather instruction if available
    if (JM()->mArch.AVX512F())
    {
        // force mask to <N-bit Integer>, required by vgather2
        Value* mask = BITCAST(vMask, mInt16Ty);

        vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
    }
    else
    {
        Value* src0 = EXTRACT_16(vSrc, 0);
        Value* src1 = EXTRACT_16(vSrc, 1);

        Value* indices0 = EXTRACT_16(vIndices, 0);
        Value* indices1 = EXTRACT_16(vIndices, 1);

        Value* mask0 = EXTRACT_16(vMask, 0);
        Value* mask1 = EXTRACT_16(vMask, 1);

        Value* gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
        Value* gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);

        vGather = JOIN_16(gather0, gather1);
    }

    return vGather;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Generate a masked gather operation in LLVM IR. If not
/// supported on the underlying platform, emulate it with loads
/// @param vSrc - SIMD wide value that will be loaded if mask is invalid
/// @param pBase - Int8* base VB address pointer value
/// @param vIndices - SIMD wide value of VB byte offsets
/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
/// @param scale - value to scale indices by
Value* Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
{
    Value* vGather;

    // use avx2 gather instruction if available
    if (JM()->mArch.AVX2())
    {
        vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
    }
    else
    {
        Value* pStack = STACKSAVE();

        // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
        Value* vSrcPtr = ALLOCA(vSrc->getType());
        STORE(vSrc, vSrcPtr);

        vGather = VUNDEF_I();
        Value* vScaleVec = VIMMED1((uint32_t)scale);
        Value* vOffsets = MUL(vIndices, vScaleVec);
        for (uint32_t i = 0; i < mVWidth; ++i)
        {
            // single component byte index
            Value* offset = VEXTRACT(vOffsets, C(i));
            // byte pointer to component
            Value* loadAddress = GEP(pBase, offset);
            loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
            // pointer to the value to load if we're masking off a component
            Value* maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
            Value* selMask = VEXTRACT(vMask, C(i));
            // switch in a safe address to load if we're trying to access a masked-off vertex
            Value* validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
            Value* val = LOAD(validAddress, C(0));
            vGather = VINSERT(vGather, val, C(i));
        }

        STACKRESTORE(pStack);
    }

    return vGather;
}

Value* Builder::GATHERDD_16(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
{
    Value* vGather = VUNDEF_I_16();

    // use AVX512F gather instruction if available
    if (JM()->mArch.AVX512F())
    {
        // force mask to <N-bit Integer>, required by vgather2
        Value* mask = BITCAST(vMask, mInt16Ty);

        vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
    }
    else
    {
        Value* src0 = EXTRACT_16(vSrc, 0);
        Value* src1 = EXTRACT_16(vSrc, 1);

        Value* indices0 = EXTRACT_16(vIndices, 0);
        Value* indices1 = EXTRACT_16(vIndices, 1);

        Value* mask0 = EXTRACT_16(vMask, 0);
        Value* mask1 = EXTRACT_16(vMask, 1);

        Value* gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
        Value* gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);

        vGather = JOIN_16(gather0, gather1);
    }

    return vGather;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Generate a masked gather operation in LLVM IR. If not
/// supported on the underlying platform, emulate it with loads
/// @param vSrc - SIMD wide value that will be loaded if mask is invalid
/// @param pBase - Int8* base VB address pointer value
/// @param vIndices - SIMD wide value of VB byte offsets
/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
/// @param scale - value to scale indices by
Value* Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
{
    Value* vGather;

    // use avx2 gather instruction if available
    if (JM()->mArch.AVX2())
    {
        vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth / 2)), VectorType::get(mDoubleTy, mVWidth / 2));
        vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
    }
    else
    {
        Value* pStack = STACKSAVE();

        // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
        Value* vSrcPtr = ALLOCA(vSrc->getType());
        STORE(vSrc, vSrcPtr);

        vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
        Value* vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
        Value* vOffsets = MUL(vIndices, vScaleVec);
        for (uint32_t i = 0; i < mVWidth / 2; ++i)
        {
            // single component byte index
            Value* offset = VEXTRACT(vOffsets, C(i));
            // byte pointer to component
            Value* loadAddress = GEP(pBase, offset);
            loadAddress = BITCAST(loadAddress, PointerType::get(mDoubleTy, 0));
            // pointer to the value to load if we're masking off a component
            Value* maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
            Value* selMask = VEXTRACT(vMask, C(i));
            // switch in a safe address to load if we're trying to access a masked-off vertex
            Value* validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
            Value* val = LOAD(validAddress);
            vGather = VINSERT(vGather, val, C(i));
        }

        STACKRESTORE(pStack);
    }

    return vGather;
}

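// Note on GATHERPD: each double spans two 32-bit SIMD slots, so only
// mVWidth / 2 elements are gathered, and on the AVX2 path the per-lane mask
// is sign-extended to i64 and bitcast to double because vgatherpd expects
// its mask in the element type of the gathered data.
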
void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
                      Value* mask, Value* vGatherComponents[], bool bPackedOutput)
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
    {
        GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
    }
    else
    {
        GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
    }
}

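// Illustrative dispatch examples (format names assumed from the SWR_FORMAT
// enum, not used in this file): R32G32B32A32_FLOAT satisfies the
// float/32-bpc test and takes the GATHER4PS path, while 8- and 16-bit
// component formats such as R8G8B8A8_UNORM fall through to GATHER4DD, which
// gathers whole dwords and shuffles the packed components apart afterwards.
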
void Builder::GATHER4PS(const SWR_FORMAT_INFO& info, Value* pSrcBase, Value* byteOffsets,
                        Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
{
    switch (info.bpp / info.numComps)
    {
    case 16:
    {
        Value* vGatherResult[2];

        // TODO: vGatherMaskedVal
        Value* vGatherMaskedVal = VIMMED1((float)0);

        // always have at least one component out of x or y to fetch
        vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
        // e.g. result of first 8x32bit gather for 16bit components
        // 256i - 0    1    2    3    4    5    6    7
        //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy

        // if we have at least one component out of z or w to fetch
        if (info.numComps > 2)
        {
            // offset base to the next components(zw) in the vertex to gather
            pSrcBase = GEP(pSrcBase, C((char)4));

            vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
            // e.g. result of second 8x32bit gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
        }
        else
        {
            vGatherResult[1] = vGatherMaskedVal;
        }

        // Shuffle gathered components into place, each row is a component
        Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        break;
    }
    case 32:
    {
        // apply defaults
        for (uint32_t i = 0; i < 4; ++i)
        {
            vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
        }

        for (uint32_t i = 0; i < info.numComps; i++)
        {
            uint32_t swizzleIndex = info.swizzle[i];

            // Gather a SIMD of components
            vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);

            // offset base to the next component to gather
            pSrcBase = GEP(pSrcBase, C((char)4));
        }
        break;
    }
    default:
        SWR_INVALID("Invalid float format");
        break;
    }
}

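// Illustrative: for a 4-component 16-bpc format (e.g. R16G16B16A16), each
// 32-bit gather lane holds a packed pair of components. The first gather
// fetches the xy pairs; advancing the base pointer by 4 bytes makes the
// second gather fetch the zw pairs, and Shuffle16bpcGather4 then separates
// the pairs into one SIMD row per component.
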
void Builder::GATHER4DD(const SWR_FORMAT_INFO& info, Value* pSrcBase, Value* byteOffsets,
                        Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
{
    switch (info.bpp / info.numComps)
    {
    case 8:
    {
        Value* vGatherMaskedVal = VIMMED1((int32_t)0);
        Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
        // e.g. result of an 8x32bit integer gather for 8bit components
        // 256i - 0    1    2    3    4    5    6    7
        //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

        Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        break;
    }
    case 16:
    {
        Value* vGatherResult[2];

        // TODO: vGatherMaskedVal
        Value* vGatherMaskedVal = VIMMED1((int32_t)0);

        // always have at least one component out of x or y to fetch
        vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
        // e.g. result of first 8x32bit integer gather for 16bit components
        // 256i - 0    1    2    3    4    5    6    7
        //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy

        // if we have at least one component out of z or w to fetch
        if (info.numComps > 2)
        {
            // offset base to the next components(zw) in the vertex to gather
            pSrcBase = GEP(pSrcBase, C((char)4));

            vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
            // e.g. result of second 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
        }
        else
        {
            vGatherResult[1] = vGatherMaskedVal;
        }

        // Shuffle gathered components into place, each row is a component
        Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        break;
    }
    case 32:
    {
        // apply defaults
        for (uint32_t i = 0; i < 4; ++i)
        {
            vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
        }

        for (uint32_t i = 0; i < info.numComps; i++)
        {
            uint32_t swizzleIndex = info.swizzle[i];

            // Gather a SIMD of components
            vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);

            // offset base to the next component to gather
            pSrcBase = GEP(pSrcBase, C((char)4));
        }
        break;
    }
    default:
        SWR_INVALID("unsupported format");
        break;
    }
}

void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO& info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
{
    // cast types
    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

    // input could either be float or int vector; do shuffle work in int
    vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
    vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

    if (bPackedOutput)
    {
        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits

        // shuffle mask
        Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                      0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
        Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
        // after pshufb: group components together in each 128bit lane
        // 256i - 0    1    2    3    4    5    6    7
        //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

        Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
        // after PERMD: move and pack xy components into each 128bit lane
        // 256i - 0    1    2    3    4    5    6    7
        //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

        // do the same for zw components
        Value* vi128ZW = nullptr;
        if (info.numComps > 2)
        {
            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
            vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
        }

        for (uint32_t i = 0; i < 4; i++)
        {
            uint32_t swizzleIndex = info.swizzle[i];
            // todo: fix for packed
            Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
            if (i >= info.numComps)
            {
                // set the default component val
                vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                continue;
            }

            // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
            uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
            // if x or y, use vi128XY permute result, else use vi128ZW
            Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

            // extract packed component 128 bit lanes
            vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
        }
    }
    else
    {
        // pshufb masks for each component
        Value* vConstMask[2];
        // x/z shuffle mask
        vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                                  0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1 });

        // y/w shuffle mask
        vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                  2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });

        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
        // apply defaults
        for (uint32_t i = 0; i < 4; ++i)
        {
            vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
        }

        for (uint32_t i = 0; i < info.numComps; i++)
        {
            uint32_t swizzleIndex = info.swizzle[i];

            // select correct constMask for x/z or y/w pshufb
            uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
            // if x or y, use the first gather result, else use the second (zw) gather
            uint32_t selectedGather = (i < 2) ? 0 : 1;

            vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
            // after pshufb mask for x channel; z uses the same shuffle from the second gather
            // 256i - 0    1    2    3    4    5    6    7
            //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
        }
    }
}

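// Illustrative: reading the packed-path shuffle in Shuffle16bpcGather4 for
// one 128-bit lane. pshufb byte indices { 0,1, 4,5, 8,9, 12,13 } collect the
// low 16-bit halves (x) of the four dwords into the lower 64 bits, and
// { 2,3, 6,7, 10,11, 14,15 } collect the high halves (y) into the upper
// 64 bits; PERMD { 0, 1, 4, 5, 2, 3, 6, 7 } then packs the x groups of both
// 128-bit lanes into the low half of the register and the y groups into the
// high half.
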
void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO& info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
{
    // cast types
    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

    if (bPackedOutput)
    {
        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits

        // shuffle mask
        Value* vConstMask = C<char>({ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                      0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 });
        Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
        // after pshufb: group components together in each 128bit lane
        // 256i - 0    1    2    3    4    5    6    7
        //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

        Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
        // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
        // 256i - 0    1    2    3    4    5    6    7
        //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

        // do the same for zw components
        Value* vi128ZW = nullptr;
        if (info.numComps > 2)
        {
            vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
        }

        // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
        for (uint32_t i = 0; i < 4; i++)
        {
            uint32_t swizzleIndex = info.swizzle[i];
            // todo: fix for packed
            Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
            if (i >= info.numComps)
            {
                // set the default component val
                vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                continue;
            }

            // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
            uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
            // if x or y, use vi128XY permute result, else use vi128ZW
            Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

            vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
        }
    }
    else
    {
        // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
        // apply defaults
        for (uint32_t i = 0; i < 4; ++i)
        {
            vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
        }

        for (uint32_t i = 0; i < info.numComps; i++)
        {
            uint32_t swizzleIndex = info.swizzle[i];

            // pshufb masks for each component
            Value* vConstMask;
            switch (i)
            {
            case 0:
                // x shuffle mask
                vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                       0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
                break;
            case 1:
                // y shuffle mask
                vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                       1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
                break;
            case 2:
                // z shuffle mask
                vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                       2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
                break;
            case 3:
                // w shuffle mask
                vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                       3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
                break;
            default:
                vConstMask = nullptr;
                break;
            }

            vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb for x channel
            // 256i - 0    1    2    3    4    5    6    7
            //        x000 x000 x000 x000 x000 x000 x000 x000
        }
    }
}

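// Illustrative: in the unpacked path above, the x-channel pshufb mask
// { 0, -1, -1, -1, 4, ... } copies byte 0 of each 32-bit xyzw element into
// the low byte of its lane and writes zero everywhere the index is -1,
// producing one zero-extended 8-bit component per 32-bit lane.
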
//////////////////////////////////////////////////////////////////////////
/// @brief emulates a scatter operation.
/// @param pDst - pointer to destination
/// @param vSrc - vector of src data to scatter
/// @param vOffsets - vector of byte offsets from pDst
/// @param vMask - mask of valid lanes
void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
{
    /* Scatter algorithm

       while(Index = BitScanForward(mask))
           srcElem = srcVector[Index]
           offsetElem = offsetVector[Index]
           *(pDst + offsetElem) = srcElem
           Update mask (&= ~(1<<Index))
    */

    BasicBlock* pCurBB = IRB()->GetInsertBlock();
    Function* pFunc = pCurBB->getParent();
    Type* pSrcTy = vSrc->getType()->getVectorElementType();

    // Store vectors on stack
    if (pScatterStackSrc == nullptr)
    {
        // Save off stack allocations and reuse per scatter. Significantly reduces stack
        // requirements for shaders with a lot of scatters.
        pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
        pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
    }

    Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
    Value* pOffsetsArrayPtr = pScatterStackOffsets;
    STORE(vSrc, pSrcArrayPtr);
    STORE(vOffsets, pOffsetsArrayPtr);

    // Cast to pointers for random access
    pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
    pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

    Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));

    // Get cttz function
    Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });

    // Setup loop basic block
    BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);

    // compute first set bit
    Value* pIndex = CALL(pfnCttz, { pMask, C(false) });

    Value* pIsUndef = ICMP_EQ(pIndex, C(32));

    // Split current block
    BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

    // Remove unconditional jump created by splitBasicBlock
    pCurBB->getTerminator()->eraseFromParent();

    // Add terminator to end of original block
    IRB()->SetInsertPoint(pCurBB);

    // Add conditional branch
    COND_BR(pIsUndef, pPostLoop, pLoop);

    // Add loop basic block contents
    IRB()->SetInsertPoint(pLoop);
    PHINode* pIndexPhi = PHI(mInt32Ty, 2);
    PHINode* pMaskPhi = PHI(mInt32Ty, 2);

    pIndexPhi->addIncoming(pIndex, pCurBB);
    pMaskPhi->addIncoming(pMask, pCurBB);

    // Extract elements for this index
    Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
    Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });

    // GEP to this offset in dst
    Value* pCurDst = GEP(pDst, pOffsetElem);
    pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
    STORE(pSrcElem, pCurDst);

    // Update the mask
    Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

    // Terminator: compute the next set bit
    Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });

    pIsUndef = ICMP_EQ(pNewIndex, C(32));
    COND_BR(pIsUndef, pPostLoop, pLoop);

    // Add phi incoming values for the loop back-edge
    pIndexPhi->addIncoming(pNewIndex, pLoop);
    pMaskPhi->addIncoming(pNewMask, pLoop);

    // Move builder to beginning of post loop
    IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
}

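// Illustrative control flow emitted by SCATTERPS, assuming the 32-bit mask
// convention that cttz == 32 means no lanes remain:
//
//   pCurBB:    pIndex = cttz(pMask); br (pIndex == 32) ? pPostLoop : pLoop
//   pLoop:     store one lane; pNewMask = pMaskPhi & ~(1 << pIndexPhi);
//              pNewIndex = cttz(pNewMask); br (pNewIndex == 32) ? pPostLoop : pLoop
//   pPostLoop: subsequent instructions are emitted here
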
//////////////////////////////////////////////////////////////////////////
/// @brief save/restore stack, providing ability to push/pop the stack and
/// reduce overall stack requirements for temporary stack use
Value* Builder::STACKSAVE()
{
    Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
    return CALLA(pfnStackSave);
}

void Builder::STACKRESTORE(Value* pSaved)
{
    Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
    CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
}
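
// Illustrative usage (mirrors the emulation paths in GATHERPS/GATHERDD
// above): bracket temporary ALLOCAs with save/restore so their stack space
// is reclaimed once the temporaries are dead.
//     Value* pStack = STACKSAVE();
//     Value* vSrcPtr = ALLOCA(vSrc->getType()); // temporary spill slot
//     // ... use vSrcPtr ...
//     STACKRESTORE(pStack);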