/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.  All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder_misc.cpp
*
* @brief Implementation for miscellaneous builder functions
*
* Notes:
*
******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"
#include "common/rdtsc_buckets.h"
Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
{
    std::vector<Value*> indices;
    for (auto i : indexList)
        indices.push_back(i);
    return GEPA(ptr, indices);
}
Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
{
    std::vector<Value*> indices;
    for (auto i : indexList)
        indices.push_back(C(i));
    return GEPA(ptr, indices);
}
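
// Illustrative use of the uint32_t overload (hypothetical operands, not part
// of this file): GEP(pVertex, { 0, 2 }) builds constant indices and emits
//   getelementptr %ty, %ty* %pVertex, i32 0, i32 2
// i.e. the address of the third member of the struct pVertex points to.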
Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
{
    std::vector<Value*> indices;
    for (auto i : indexList)
        indices.push_back(i);
    return IN_BOUNDS_GEP(ptr, indices);
}
Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
{
    std::vector<Value*> indices;
    for (auto i : indexList)
        indices.push_back(C(i));
    return IN_BOUNDS_GEP(ptr, indices);
}
LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
{
    std::vector<Value*> valIndices;
    for (auto i : indices)
        valIndices.push_back(C(i));
    return LOAD(GEPA(basePtr, valIndices), name);
}
LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
{
    std::vector<Value*> valIndices;
    for (auto i : indices)
        valIndices.push_back(i);
    return LOAD(GEPA(basePtr, valIndices), name);
}
StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
{
    std::vector<Value*> valIndices;
    for (auto i : indices)
        valIndices.push_back(C(i));
    return STORE(val, GEPA(basePtr, valIndices));
}
StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
{
    std::vector<Value*> valIndices;
    for (auto i : indices)
        valIndices.push_back(i);
    return STORE(val, GEPA(basePtr, valIndices));
}
//////////////////////////////////////////////////////////////////////////
/// @brief Generate an i32 masked load operation in LLVM IR.  If not
///        supported on the underlying platform, emulate it with a float masked load
/// @param src - base address pointer for the load
/// @param mask - SIMD wide mask that controls whether to access memory or load 0
Value *Builder::MASKLOADD(Value* src, Value* mask)
{
    Value* vResult;
    // use avx2 maskload instruction if available
    if (JM()->mArch.AVX2())
    {
        Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
        vResult = CALL(func, { src, mask });
    }
    else
    {
        // maskload intrinsic expects integer mask operand in llvm >= 3.8
#if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
        mask = BITCAST(mask, VectorType::get(mInt32Ty, mVWidth));
#else
        mask = BITCAST(mask, VectorType::get(mFP32Ty, mVWidth));
#endif
        Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskload_ps_256);
        vResult = BITCAST(CALL(func, { src, mask }), VectorType::get(mInt32Ty, mVWidth));
    }
    return vResult;
}
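
// Note: the float fallback is sound because both maskload variants key only
// off the sign bit of each 32-bit lane, so reinterpreting the mask (and the
// loaded data) between <N x i32> and <N x float> does not change behavior.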
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a masked gather operation in LLVM IR.  If not
///        supported on the underlying platform, emulate it with loads
/// @param vSrc - SIMD wide value that will be loaded if mask is invalid
/// @param pBase - Int8* base VB address pointer value
/// @param vIndices - SIMD wide value of VB byte offsets
/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
/// @param scale - value to scale indices by
Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
{
    Value *vGather;

    // use avx2 gather instruction if available
    if (JM()->mArch.AVX2())
    {
        // force mask to <N x float>, required by vgather
        Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);

        vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale));
    }
    else
    {
        Value* pStack = STACKSAVE();

        // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
        Value* vSrcPtr = ALLOCA(vSrc->getType());
        STORE(vSrc, vSrcPtr);

        vGather = VUNDEF_F();
        Value *vScaleVec = VIMMED1((uint32_t)scale);
        Value *vOffsets = MUL(vIndices, vScaleVec);
        for (uint32_t i = 0; i < mVWidth; ++i)
        {
            // single component byte index
            Value *offset = VEXTRACT(vOffsets, C(i));
            // byte pointer to component
            Value *loadAddress = GEP(pBase, offset);
            loadAddress = BITCAST(loadAddress, PointerType::get(mFP32Ty, 0));
            // pointer to the value to load if we're masking off a component
            Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
            Value *selMask = VEXTRACT(vMask, C(i));
            // switch in a safe address to load from when this lane is masked off
            Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
            Value *val = LOAD(validAddress);
            vGather = VINSERT(vGather, val, C(i));
        }

        STACKRESTORE(pStack);
    }

    return vGather;
}
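
// Per-lane scalar equivalent of the emulated path above (illustrative only):
//   for (i = 0; i < mVWidth; ++i)
//       result[i] = vMask[i] ? *(float*)(pBase + vIndices[i] * scale) : vSrc[i];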
Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
{
    Value *vGather = VUNDEF_F_16();

    // use AVX512F gather instruction if available
    if (JM()->mArch.AVX512F())
    {
        // force mask to <N-bit integer>, required by vgather
        Value *mask = BITCAST(vMask, mInt16Ty);

        vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
    }
    else
    {
        Value *src0 = EXTRACT_16(vSrc, 0);
        Value *src1 = EXTRACT_16(vSrc, 1);

        Value *indices0 = EXTRACT_16(vIndices, 0);
        Value *indices1 = EXTRACT_16(vIndices, 1);

        Value *mask0 = EXTRACT_16(vMask, 0);
        Value *mask1 = EXTRACT_16(vMask, 1);

        Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
        Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);

        vGather = JOIN_16(gather0, gather1);
    }

    return vGather;
}
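
// The 16-wide variant simply recurses on the two 8-wide halves, so any target
// with a working GATHERPS (native or emulated) gets GATHERPS_16 as well.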
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a masked gather operation in LLVM IR.  If not
///        supported on the underlying platform, emulate it with loads
/// @param vSrc - SIMD wide value that will be loaded if mask is invalid
/// @param pBase - Int8* base VB address pointer value
/// @param vIndices - SIMD wide value of VB byte offsets
/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
/// @param scale - value to scale indices by
Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
{
    Value* vGather;

    // use avx2 gather instruction if available
    if (JM()->mArch.AVX2())
    {
        vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
    }
    else
    {
        Value* pStack = STACKSAVE();

        // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
        Value* vSrcPtr = ALLOCA(vSrc->getType());
        STORE(vSrc, vSrcPtr);

        vGather = VUNDEF_I();
        Value *vScaleVec = VIMMED1((uint32_t)scale);
        Value *vOffsets = MUL(vIndices, vScaleVec);
        for (uint32_t i = 0; i < mVWidth; ++i)
        {
            // single component byte index
            Value *offset = VEXTRACT(vOffsets, C(i));
            // byte pointer to component
            Value *loadAddress = GEP(pBase, offset);
            loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
            // pointer to the value to load if we're masking off a component
            Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
            Value *selMask = VEXTRACT(vMask, C(i));
            // switch in a safe address to load from when this lane is masked off
            Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
            Value *val = LOAD(validAddress, C(0));
            vGather = VINSERT(vGather, val, C(i));
        }

        STACKRESTORE(pStack);
    }

    return vGather;
}
Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
{
    Value *vGather = VUNDEF_I_16();

    // use AVX512F gather instruction if available
    if (JM()->mArch.AVX512F())
    {
        // force mask to <N-bit integer>, required by vgather
        Value *mask = BITCAST(vMask, mInt16Ty);

        vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
    }
    else
    {
        Value *src0 = EXTRACT_16(vSrc, 0);
        Value *src1 = EXTRACT_16(vSrc, 1);

        Value *indices0 = EXTRACT_16(vIndices, 0);
        Value *indices1 = EXTRACT_16(vIndices, 1);

        Value *mask0 = EXTRACT_16(vMask, 0);
        Value *mask1 = EXTRACT_16(vMask, 1);

        Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
        Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);

        vGather = JOIN_16(gather0, gather1);
    }

    return vGather;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a masked gather operation in LLVM IR.  If not
///        supported on the underlying platform, emulate it with loads
/// @param vSrc - SIMD wide value that will be loaded if mask is invalid
/// @param pBase - Int8* base VB address pointer value
/// @param vIndices - SIMD wide value of VB byte offsets
/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
/// @param scale - value to scale indices by
Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
{
    Value* vGather;

    // use avx2 gather instruction if available
    if (JM()->mArch.AVX2())
    {
        vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth / 2)), VectorType::get(mDoubleTy, mVWidth / 2));
        vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
    }
    else
    {
        Value* pStack = STACKSAVE();

        // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
        Value* vSrcPtr = ALLOCA(vSrc->getType());
        STORE(vSrc, vSrcPtr);

        vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
        Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
        Value *vOffsets = MUL(vIndices, vScaleVec);
        for (uint32_t i = 0; i < mVWidth / 2; ++i)
        {
            // single component byte index
            Value *offset = VEXTRACT(vOffsets, C(i));
            // byte pointer to component
            Value *loadAddress = GEP(pBase, offset);
            loadAddress = BITCAST(loadAddress, PointerType::get(mDoubleTy, 0));
            // pointer to the value to load if we're masking off a component
            Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
            Value *selMask = VEXTRACT(vMask, C(i));
            // switch in a safe address to load from when this lane is masked off
            Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
            Value *val = LOAD(validAddress);
            vGather = VINSERT(vGather, val, C(i));
        }

        STACKRESTORE(pStack);
    }

    return vGather;
}
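
// Note: the emulated path above assumes an 8-wide SIMD; the vector width and
// splat count are hard-coded to 4 (= mVWidth / 2 double lanes), so other
// widths would need those constants generalized.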
void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
                      Value* mask, Value* vGatherComponents[], bool bPackedOutput)
{
    const SWR_FORMAT_INFO &info = GetFormatInfo(format);
    if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
    {
        GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
    }
    else
    {
        GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
    }
}
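
// Illustrative call (hypothetical operands, not part of this file):
//   Value* vComps[4];
//   Gather4(R32G32B32A32_FLOAT, pTexelBase, vByteOffsets, vLaneMask, vComps, false);
// On return, vComps[0..3] each hold a SIMD of the x, y, z, and w components.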
void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
                        Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
{
    switch (info.bpp / info.numComps)
    {
    case 16:
    {
        Value* vGatherResult[2];

        // TODO: vGatherMaskedVal
        Value* vGatherMaskedVal = VIMMED1((float)0);

        // always have at least one component out of x or y to fetch
        vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
        // e.g. result of first 8x32bit integer gather for 16bit components
        // 256i - 0    1    2    3    4    5    6    7
        //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy

        // if we have at least one component out of z or w to fetch
        if (info.numComps > 2)
        {
            // offset base to the next components (zw) in the vertex to gather
            pSrcBase = GEP(pSrcBase, C((char)4));

            vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
            // e.g. result of second 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
        }
        else
        {
            vGatherResult[1] = vGatherMaskedVal;
        }

        // Shuffle gathered components into place, each row is a component
        Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        break;
    }
    case 32:
    {
        // apply defaults
        for (uint32_t i = 0; i < 4; ++i)
        {
            vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
        }

        for (uint32_t i = 0; i < info.numComps; i++)
        {
            uint32_t swizzleIndex = info.swizzle[i];

            // Gather a SIMD of components
            vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);

            // offset base to the next component to gather
            pSrcBase = GEP(pSrcBase, C((char)4));
        }
        break;
    }
    default:
        SWR_INVALID("Invalid float format");
        break;
    }
}
void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
                        Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
{
    switch (info.bpp / info.numComps)
    {
    case 8:
    {
        Value* vGatherMaskedVal = VIMMED1((int32_t)0);
        Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
        // e.g. result of an 8x32bit integer gather for 8bit components
        // 256i - 0    1    2    3    4    5    6    7
        //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

        Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        break;
    }
    case 16:
    {
        Value* vGatherResult[2];

        // TODO: vGatherMaskedVal
        Value* vGatherMaskedVal = VIMMED1((int32_t)0);

        // always have at least one component out of x or y to fetch
        vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
        // e.g. result of first 8x32bit integer gather for 16bit components
        // 256i - 0    1    2    3    4    5    6    7
        //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy

        // if we have at least one component out of z or w to fetch
        if (info.numComps > 2)
        {
            // offset base to the next components (zw) in the vertex to gather
            pSrcBase = GEP(pSrcBase, C((char)4));

            vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
            // e.g. result of second 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
        }
        else
        {
            vGatherResult[1] = vGatherMaskedVal;
        }

        // Shuffle gathered components into place, each row is a component
        Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        break;
    }
    case 32:
    {
        // apply defaults
        for (uint32_t i = 0; i < 4; ++i)
        {
            vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
        }

        for (uint32_t i = 0; i < info.numComps; i++)
        {
            uint32_t swizzleIndex = info.swizzle[i];

            // Gather a SIMD of components
            vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);

            // offset base to the next component to gather
            pSrcBase = GEP(pSrcBase, C((char)4));
        }
        break;
    }
    default:
        SWR_INVALID("unsupported format");
        break;
    }
}
void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
{
    // cast types
    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

    // input could either be float or int vector; do shuffle work in int
    vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
    vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

    if (bPackedOutput)
    {
        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits

        // shuffle mask
        Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                      0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
        Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
        // after pshufb: group components together in each 128bit lane
        // 256i - 0    1    2    3    4    5    6    7
        //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

        Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
        // after PERMD: move and pack xy components into each 128bit lane
        // 256i - 0    1    2    3    4    5    6    7
        //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

        // do the same for zw components
        Value* vi128ZW = nullptr;
        if (info.numComps > 2)
        {
            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
            vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
        }

        for (uint32_t i = 0; i < 4; i++)
        {
            uint32_t swizzleIndex = info.swizzle[i];
            // todo: fix for packed
            Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
            if (i >= info.numComps)
            {
                // set the default component val
                vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                continue;
            }

            // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
            uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
            // if x or y, use vi128XY permute result, else use vi128ZW
            Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

            // extract packed component 128 bit lanes
            vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
        }
    }
    else
    {
        // pshufb masks for each component
        Value* vConstMask[2];
        // x/z shuffle mask
        vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                                  0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1 });
        // y/w shuffle mask
        vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                  2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });

        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
        // apply defaults
        for (uint32_t i = 0; i < 4; ++i)
        {
            vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
        }

        for (uint32_t i = 0; i < info.numComps; i++)
        {
            uint32_t swizzleIndex = info.swizzle[i];

            // select correct constMask for x/z or y/w pshufb
            uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
            // select the gather result holding xy for components 0/1, zw for 2/3
            uint32_t selectedGather = (i < 2) ? 0 : 1;

            vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
            // after pshufb mask for x channel; z uses the same shuffle from the second gather
            // 256i - 0    1    2    3    4    5    6    7
            //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
        }
    }
}
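
// How the packed-path shuffle mask works: within each 16-byte half, entries
// { 0, 1, 4, 5, 8, 9, 12, 13 } select the low word of each 32-bit gather lane
// (the x components) and { 2, 3, 6, 7, 10, 11, 14, 15 } select the high words
// (the y components), producing the xxxx/yyyy grouping diagrammed above.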
void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
{
    // cast types
    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

    if (bPackedOutput)
    {
        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits

        // shuffle mask
        Value* vConstMask = C<char>({ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                      0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 });
        Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
        // after pshufb: group components together in each 128bit lane
        // 256i - 0    1    2    3    4    5    6    7
        //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

        Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
        // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
        // 256i - 0    1    2    3    4    5    6    7
        //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

        // do the same for zw components
        Value* vi128ZW = nullptr;
        if (info.numComps > 2)
        {
            vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
        }

        // extract all enabled components; disabled components get their default value
        for (uint32_t i = 0; i < 4; i++)
        {
            uint32_t swizzleIndex = info.swizzle[i];
            // todo: fix for packed
            Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
            if (i >= info.numComps)
            {
                // set the default component val
                vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                continue;
            }

            // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
            uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
            // if x or y, use vi128XY permute result, else use vi128ZW
            Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

            vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
        }
    }
    else
    {
        // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
        // apply defaults
        for (uint32_t i = 0; i < 4; ++i)
        {
            vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
        }

        for (uint32_t i = 0; i < info.numComps; i++)
        {
            uint32_t swizzleIndex = info.swizzle[i];

            // pshufb masks for each component
            Value* vConstMask;
            switch (i)
            {
            case 0:
                // x shuffle mask
                vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                       0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
                break;
            case 1:
                // y shuffle mask
                vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                       1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
                break;
            case 2:
                // z shuffle mask
                vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                       2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
                break;
            case 3:
                // w shuffle mask
                vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                       3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
                break;
            default:
                vConstMask = nullptr;
                break;
            }

            vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb for x channel
            // 256i - 0    1    2    3    4    5    6    7
            //        x000 x000 x000 x000 x000 x000 x000 x000
        }
    }
}
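
// Note: the -1 entries in the masks above rely on PSHUFB semantics: a mask
// byte with its high bit set zeroes the destination byte, which is what
// provides the zero extension to 32 bits here.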
//////////////////////////////////////////////////////////////////////////
/// @brief emulates a scatter operation.
/// @param pDst - pointer to destination
/// @param vSrc - vector of src data to scatter
/// @param vOffsets - vector of byte offsets from pDst
/// @param vMask - mask of valid lanes
void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
{
    /* Scatter algorithm:

       while (Index = BitScanForward(mask))
           srcElem = srcVector[Index]
           offsetElem = offsetVector[Index]
           *(pDst + offsetElem) = srcElem
           mask &= ~(1 << Index)
    */

    BasicBlock* pCurBB = IRB()->GetInsertBlock();
    Function* pFunc = pCurBB->getParent();
    Type* pSrcTy = vSrc->getType()->getVectorElementType();

    // Store vectors on stack
    if (pScatterStackSrc == nullptr)
    {
        // Save off stack allocations and reuse per scatter. Significantly reduces stack
        // requirements for shaders with a lot of scatters.
        pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
        pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
    }

    Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
    Value* pOffsetsArrayPtr = pScatterStackOffsets;
    STORE(vSrc, pSrcArrayPtr);
    STORE(vOffsets, pOffsetsArrayPtr);

    // Cast to pointers for random access
    pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
    pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

    Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));

    // Get cttz function
    Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });

    // Setup loop basic block
    BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);

    // compute first set bit
    Value* pIndex = CALL(pfnCttz, { pMask, C(false) });

    Value* pIsUndef = ICMP_EQ(pIndex, C(32));

    // Split current block
    BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

    // Remove unconditional jump created by splitBasicBlock
    pCurBB->getTerminator()->eraseFromParent();

    // Add terminator to end of original block
    IRB()->SetInsertPoint(pCurBB);

    // Add conditional branch
    COND_BR(pIsUndef, pPostLoop, pLoop);

    // Add loop basic block contents
    IRB()->SetInsertPoint(pLoop);
    PHINode* pIndexPhi = PHI(mInt32Ty, 2);
    PHINode* pMaskPhi = PHI(mInt32Ty, 2);

    pIndexPhi->addIncoming(pIndex, pCurBB);
    pMaskPhi->addIncoming(pMask, pCurBB);

    // Extract elements for this index
    Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
    Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });

    // GEP to this offset in dst
    Value* pCurDst = GEP(pDst, pOffsetElem);
    pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
    STORE(pSrcElem, pCurDst);

    // Update the mask
    Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

    // Terminator
    Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });

    pIsUndef = ICMP_EQ(pNewIndex, C(32));
    COND_BR(pIsUndef, pPostLoop, pLoop);

    // Update phi edges
    pIndexPhi->addIncoming(pNewIndex, pLoop);
    pMaskPhi->addIncoming(pNewMask, pLoop);

    // Move builder to beginning of post loop
    IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
}
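
// Control flow emitted above, sketched as blocks (the entry block is the
// caller's current block after the split):
//   entry:        mask = movmsk(vMask); idx = cttz(mask)
//                 br (idx == 32) ? post : loop
//   Scatter_Loop: store src[idx] to pDst + offsets[idx]
//                 mask &= ~(1 << idx); idx = cttz(mask)
//                 br (idx == 32) ? post : loop
//   post:         (builder resumes here)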
//////////////////////////////////////////////////////////////////////////
/// @brief save/restore stack, providing ability to push/pop the stack and
///        reduce overall stack requirements for temporary stack use
Value* Builder::STACKSAVE()
{
    Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
    return CALLA(pfnStackSave);
}

void Builder::STACKRESTORE(Value* pSaved)
{
    Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
    CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
}
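
// Typical pairing, as used by the gather emulation paths above:
//   Value* pStack = STACKSAVE();
//   Value* pTmp = ALLOCA(ty);   // temporary live only within this scope
//   ...
//   STACKRESTORE(pStack);       // reclaims pTmp's stack space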