/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder_misc.cpp
*
* @brief Implementation for miscellaneous builder functions
*
* Notes:
*
******************************************************************************/
#include "builder.h"
#include "common/rdtsc_buckets.h"

#include <cmath>
#include <cstdarg>
void __cdecl CallPrint(const char* fmt, ...);
//////////////////////////////////////////////////////////////////////////
/// @brief Convert an IEEE 754 32-bit single precision float to a
///        16-bit float with 5 exponent bits and a variable
///        number of mantissa bits.
/// @param val - 32-bit float
/// @todo Maybe move this outside of this file into a header?
static uint16_t Convert32To16Float(float val)
{
    uint32_t sign, exp, mant;
    uint32_t roundBits;

    // Extract the sign, exponent, and mantissa
    uint32_t uf = *(uint32_t*)&val;
    sign = (uf & 0x80000000) >> 31;
    exp = (uf & 0x7F800000) >> 23;
    mant = uf & 0x007FFFFF;

    // Check for out of range
    if (std::isnan(val))
    {
        exp = 0x1F;
        mant = 0x200;
        sign = 1; // set the sign bit for NANs
    }
    else if (std::isinf(val))
    {
        exp = 0x1F;
        mant = 0x0;
    }
    else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
    {
        exp = 0x1E;
        mant = 0x3FF;
    }
    else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
    {
        mant |= 0x00800000;
        for (; exp <= 0x70; mant >>= 1, exp++)
            ;
        exp = 0;
        mant = mant >> 13;
    }
    else if (exp < 0x66) // Too small to represent -> Zero
    {
        exp = 0;
        mant = 0;
    }
    else
    {
        // Saves bits that will be shifted off for rounding
        roundBits = mant & 0x1FFFu;
        // convert exponent and mantissa to 16 bit format
        exp = exp - 0x70;
        mant = mant >> 13;

        // Essentially RTZ, but round up if off by only 1 lsb
        if (roundBits == 0x1FFFu)
        {
            mant++;
            // check for overflow
            if ((mant & 0xC00u) != 0)
                exp++;
            // make sure only the needed bits are used
            mant &= 0x3FF;
        }
    }

    uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
    return (uint16_t)tmpVal;
}
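
// Worked example for reference (values checked by hand, not from the original
// source): Convert32To16Float(1.0f) sees uf = 0x3F800000 (sign 0, exp 0x7F,
// mant 0); the normal-range path yields exp = 0x7F - 0x70 = 0xF and mant = 0,
// so the result is (0xF << 10) = 0x3C00, the half-precision encoding of 1.0.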
//////////////////////////////////////////////////////////////////////////
/// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
///        float
/// @param val - 16-bit float
/// @todo Maybe move this outside of this file into a header?
static float ConvertSmallFloatTo32(uint32_t val)
{
    uint32_t result;
    if ((val & 0x7fff) == 0)
    {
        result = ((uint32_t)(val & 0x8000)) << 16;
    }
    else if ((val & 0x7c00) == 0x7c00)
    {
        result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
        result |= ((uint32_t)val & 0x8000) << 16;
    }
    else
    {
        uint32_t sign = (val & 0x8000) << 16;
        uint32_t mant = (val & 0x3ff) << 13;
        uint32_t exp = (val >> 10) & 0x1f;

        if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
        {
            mant <<= 1;
            while (mant < (0x400 << 13))
            {
                exp--;
                mant <<= 1;
            }
            mant &= (0x3ff << 13);
        }

        exp = ((exp - 15 + 127) & 0xff) << 23;
        result = sign | exp | mant;
    }

    return *(float*)&result;
}
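
// Worked example for reference (inverse of the helper above): val = 0x3C00 has
// sign = 0, exp = 0xF, mant = 0, so exp maps to ((15 - 15 + 127) & 0xff) << 23
// = 0x3F800000 and the reassembled bit pattern is 0x3F800000, i.e. 1.0f. The
// two helpers therefore round-trip half <-> float across the normal range.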
Constant *Builder::C(bool i)
{
    return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
}

Constant *Builder::C(char i)
{
    return ConstantInt::get(IRB()->getInt8Ty(), i);
}

Constant *Builder::C(uint8_t i)
{
    return ConstantInt::get(IRB()->getInt8Ty(), i);
}

Constant *Builder::C(int i)
{
    return ConstantInt::get(IRB()->getInt32Ty(), i);
}

Constant *Builder::C(int64_t i)
{
    return ConstantInt::get(IRB()->getInt64Ty(), i);
}

Constant *Builder::C(uint16_t i)
{
    return ConstantInt::get(mInt16Ty, i);
}

Constant *Builder::C(uint32_t i)
{
    return ConstantInt::get(IRB()->getInt32Ty(), i);
}

Constant *Builder::C(float i)
{
    return ConstantFP::get(IRB()->getFloatTy(), i);
}

Constant *Builder::PRED(bool pred)
{
    return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
}
Value *Builder::VIMMED1(int i)
{
    return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
}

Value *Builder::VIMMED1(uint32_t i)
{
    return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
}

Value *Builder::VIMMED1(float i)
{
    return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
}

Value *Builder::VIMMED1(bool i)
{
    return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
}
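
// Usage sketch (illustrative, not from the original source): C(1.0f) builds a
// scalar float constant, while VIMMED1(1.0f) splats it across the SIMD width,
// e.g. an <8 x float> of all 1.0 when mVWidth is 8 (AVX/AVX2 targets).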
Value *Builder::VUNDEF_IPTR()
{
    return UndefValue::get(VectorType::get(mInt32PtrTy, mVWidth));
}

Value *Builder::VUNDEF_I()
{
    return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
}

Value *Builder::VUNDEF(Type *ty, uint32_t size)
{
    return UndefValue::get(VectorType::get(ty, size));
}

Value *Builder::VUNDEF_F()
{
    return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
}

Value *Builder::VUNDEF(Type* t)
{
    return UndefValue::get(VectorType::get(t, mVWidth));
}
Value *Builder::VBROADCAST(Value *src)
{
    // check if src is already a vector
    if (src->getType()->isVectorTy())
    {
        return src;
    }

    return VECTOR_SPLAT(mVWidth, src);
}
uint32_t Builder::IMMED(Value* v)
{
    SWR_ASSERT(isa<ConstantInt>(v));
    ConstantInt *pValConst = cast<ConstantInt>(v);
    return pValConst->getZExtValue();
}

int32_t Builder::S_IMMED(Value* v)
{
    SWR_ASSERT(isa<ConstantInt>(v));
    ConstantInt *pValConst = cast<ConstantInt>(v);
    return pValConst->getSExtValue();
}
Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
{
    std::vector<Value*> indices;
    for (auto i : indexList)
        indices.push_back(i);
    return GEPA(ptr, indices);
}

Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
{
    std::vector<Value*> indices;
    for (auto i : indexList)
        indices.push_back(C(i));
    return GEPA(ptr, indices);
}

Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
{
    std::vector<Value*> indices;
    for (auto i : indexList)
        indices.push_back(i);
    return IN_BOUNDS_GEP(ptr, indices);
}

Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
{
    std::vector<Value*> indices;
    for (auto i : indexList)
        indices.push_back(C(i));
    return IN_BOUNDS_GEP(ptr, indices);
}
LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
{
    std::vector<Value*> valIndices;
    for (auto i : indices)
        valIndices.push_back(C(i));
    return LOAD(GEPA(basePtr, valIndices), name);
}

LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
{
    std::vector<Value*> valIndices;
    for (auto i : indices)
        valIndices.push_back(i);
    return LOAD(GEPA(basePtr, valIndices), name);
}
StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
{
    std::vector<Value*> valIndices;
    for (auto i : indices)
        valIndices.push_back(C(i));
    return STORE(val, GEPA(basePtr, valIndices));
}

StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
{
    std::vector<Value*> valIndices;
    for (auto i : indices)
        valIndices.push_back(i);
    return STORE(val, GEPA(basePtr, valIndices));
}
CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
{
    std::vector<Value*> args;
    for (auto arg : argsList)
        args.push_back(arg);
    return CALLA(Callee, args);
}

CallInst *Builder::CALL(Value *Callee, Value* arg)
{
    std::vector<Value*> args;
    args.push_back(arg);
    return CALLA(Callee, args);
}

CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
{
    std::vector<Value*> args;
    args.push_back(arg1);
    args.push_back(arg2);
    return CALLA(Callee, args);
}

CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
{
    std::vector<Value*> args;
    args.push_back(arg1);
    args.push_back(arg2);
    args.push_back(arg3);
    return CALLA(Callee, args);
}
//////////////////////////////////////////////////////////////////////////
Value *Builder::DEBUGTRAP()
{
    Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
    return CALL(func);
}
Value *Builder::VRCP(Value *va)
{
    return FDIV(VIMMED1(1.0f), va); // 1 / a
}
Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
{
    Value* vOut = FMADDPS(vA, vX, vC);
    vOut = FMADDPS(vB, vY, vOut);
    return vOut;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Generate an i32 masked load operation in LLVM IR. If not
/// supported on the underlying platform, emulate it with a float masked load
/// @param src - base address pointer for the load
/// @param vMask - SIMD wide mask that controls whether to access memory or load 0
Value *Builder::MASKLOADD(Value* src, Value* mask)
{
    Value* vResult;
    // use avx2 maskload instruction if available
    if (JM()->mArch.AVX2())
    {
        Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
        vResult = CALL(func, {src, mask});
    }
    else
    {
        // maskload intrinsic expects integer mask operand in llvm >= 3.8
#if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
        mask = BITCAST(mask, VectorType::get(mInt32Ty, mVWidth));
#else
        mask = BITCAST(mask, VectorType::get(mFP32Ty, mVWidth));
#endif
        Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskload_ps_256);
        vResult = BITCAST(CALL(func, {src, mask}), VectorType::get(mInt32Ty, mVWidth));
    }
    return vResult;
}
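
// Note on the fallback above: the AVX float maskload (vmaskmovps) performs the
// same masked memory access as the i32 variant; only the nominal element type
// differs, which is why a bitcast back to <N x i32> is sufficient.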
//////////////////////////////////////////////////////////////////////////
/// @brief insert a JIT call to CallPrint
/// - outputs formatted string to both stdout and VS output window
/// - DEBUG builds only
/// Usage example:
///   PRINT("index %d = 0x%p\n",{C(lane), pIndex});
///   where C(lane) creates a constant value to print, and pIndex is the Value*
///   result from a GEP, printing out the pointer to memory
/// @param printStr - constant string to print, which includes format specifiers
/// @param printArgs - initializer list of Value*'s to print to std out
CallInst *Builder::PRINT(const std::string &printStr, const std::initializer_list<Value*> &printArgs)
{
    // push the arguments to CallPrint into a vector
    std::vector<Value*> printCallArgs;
    // save room for the format string. we still need to modify it for vectors
    printCallArgs.resize(1);

    // search through the format string for special processing
    size_t pos = 0;
    std::string tempStr(printStr);
    pos = tempStr.find('%', pos);
    auto v = printArgs.begin();

    while ((pos != std::string::npos) && (v != printArgs.end()))
    {
        Value* pArg = *v;
        Type* pType = pArg->getType();

        if (pType->isVectorTy())
        {
            Type* pContainedType = pType->getContainedType(0);

            if (toupper(tempStr[pos + 1]) == 'X')
            {
                tempStr[pos] = '0';
                tempStr[pos + 1] = 'x';
                tempStr.insert(pos + 2, "%08X ");
                pos += 7;

                printCallArgs.push_back(VEXTRACT(pArg, C(0)));

                std::string vectorFormatStr;
                for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
                {
                    vectorFormatStr += "0x%08X ";
                    printCallArgs.push_back(VEXTRACT(pArg, C(i)));
                }

                tempStr.insert(pos, vectorFormatStr);
                pos += vectorFormatStr.size();
            }
            else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
            {
                uint32_t i = 0;
                for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
                {
                    tempStr.insert(pos, std::string("%f "));
                    pos += 3;
                    printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
                }
                printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
            }
            else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
            {
                uint32_t i = 0;
                for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
                {
                    tempStr.insert(pos, std::string("%d "));
                    pos += 3;
                    printCallArgs.push_back(VEXTRACT(pArg, C(i)));
                }
                printCallArgs.push_back(VEXTRACT(pArg, C(i)));
            }
        }
        else
        {
            if (toupper(tempStr[pos + 1]) == 'X')
            {
                tempStr[pos] = '0';
                tempStr.insert(pos + 1, "x%08");
                printCallArgs.push_back(pArg);
                pos += 3;
            }
            // for %f we need to cast float Values to doubles so that they print out correctly
            else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
            {
                printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
            }
            else
            {
                printCallArgs.push_back(pArg);
            }
        }

        // advance to the next argument
        v++;
        pos = tempStr.find('%', ++pos);
    }

    // create global variable constant string
    Constant *constString = ConstantDataArray::getString(JM()->mContext, tempStr, true);
    GlobalVariable *gvPtr = new GlobalVariable(constString->getType(), true, GlobalValue::InternalLinkage, constString, "printStr");
    JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);

    // get a pointer to the first character in the constant string array
    std::vector<Constant*> geplist{C(0), C(0)};
    Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr, geplist, false);

    // insert the pointer to the format string in the argument vector
    printCallArgs[0] = strGEP;

    // get pointer to CallPrint function and insert decl into the module if needed
    std::vector<Type*> args;
    args.push_back(PointerType::get(mInt8Ty, 0));
    FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, true);
    Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));

    // if we haven't yet added the symbol to the symbol table
    if ((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
    {
        sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
    }

    // insert a call to CallPrint
    return CALLA(callPrintFn, printCallArgs);
}
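
// Illustrative expansion of the rewriting above: PRINT("mask %x\n", {vMask})
// with an 8-wide vector argument turns the single "%x" into eight "0x%08X "
// specifiers and pushes each extracted lane as its own scalar argument.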
//////////////////////////////////////////////////////////////////////////
/// @brief Wrapper around PRINT with initializer list.
CallInst* Builder::PRINT(const std::string &printStr)
{
    return PRINT(printStr, {});
}
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a masked gather operation in LLVM IR. If not
/// supported on the underlying platform, emulate it with loads
/// @param vSrc - SIMD wide value that will be loaded if mask is invalid
/// @param pBase - Int8* base VB address pointer value
/// @param vIndices - SIMD wide value of VB byte offsets
/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
/// @param scale - value to scale indices by
Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
{
    Value* vGather;

    // use avx2 gather instruction if available
    if (JM()->mArch.AVX2())
    {
        // force mask to <N x float>, required by vgather
        vMask = BITCAST(vMask, mSimdFP32Ty);
        vGather = VGATHERPS(vSrc, pBase, vIndices, vMask, scale);
    }
    else
    {
        Value* pStack = STACKSAVE();

        // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
        Value* vSrcPtr = ALLOCA(vSrc->getType());
        STORE(vSrc, vSrcPtr);

        vGather = VUNDEF_F();
        Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
        Value *vOffsets = MUL(vIndices, vScaleVec);
        Value *mask = MASK(vMask);
        for (uint32_t i = 0; i < mVWidth; ++i)
        {
            // single component byte index
            Value *offset = VEXTRACT(vOffsets, C(i));
            // byte pointer to component
            Value *loadAddress = GEP(pBase, offset);
            loadAddress = BITCAST(loadAddress, PointerType::get(mFP32Ty, 0));
            // pointer to the value to load if we're masking off a component
            Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
            Value *selMask = VEXTRACT(mask, C(i));
            // switch in a safe address to load if we're trying to access a vertex
            Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
            Value *val = LOAD(validAddress);
            vGather = VINSERT(vGather, val, C(i));
        }
        STACKRESTORE(pStack);
    }

    return vGather;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a masked gather operation in LLVM IR. If not
/// supported on the underlying platform, emulate it with loads
/// @param vSrc - SIMD wide value that will be loaded if mask is invalid
/// @param pBase - Int8* base VB address pointer value
/// @param vIndices - SIMD wide value of VB byte offsets
/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
/// @param scale - value to scale indices by
Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
{
    Value* vGather;

    // use avx2 gather instruction if available
    if (JM()->mArch.AVX2())
    {
        vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
    }
    else
    {
        Value* pStack = STACKSAVE();

        // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
        Value* vSrcPtr = ALLOCA(vSrc->getType());
        STORE(vSrc, vSrcPtr);

        vGather = VUNDEF_I();
        Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
        Value *vOffsets = MUL(vIndices, vScaleVec);
        Value *mask = MASK(vMask);
        for (uint32_t i = 0; i < mVWidth; ++i)
        {
            // single component byte index
            Value *offset = VEXTRACT(vOffsets, C(i));
            // byte pointer to component
            Value *loadAddress = GEP(pBase, offset);
            loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
            // pointer to the value to load if we're masking off a component
            Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
            Value *selMask = VEXTRACT(mask, C(i));
            // switch in a safe address to load if we're trying to access a vertex
            Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
            Value *val = LOAD(validAddress, C(0));
            vGather = VINSERT(vGather, val, C(i));
        }
        STACKRESTORE(pStack);
    }

    return vGather;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a masked gather operation in LLVM IR. If not
/// supported on the underlying platform, emulate it with loads
/// @param vSrc - SIMD wide value that will be loaded if mask is invalid
/// @param pBase - Int8* base VB address pointer value
/// @param vIndices - SIMD wide value of VB byte offsets
/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
/// @param scale - value to scale indices by
Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
{
    Value* vGather;

    // use avx2 gather instruction if available
    if (JM()->mArch.AVX2())
    {
        vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, scale);
    }
    else
    {
        Value* pStack = STACKSAVE();

        // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
        Value* vSrcPtr = ALLOCA(vSrc->getType());
        STORE(vSrc, vSrcPtr);

        vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
        Value *vScaleVec = VECTOR_SPLAT(4, Z_EXT(scale, mInt32Ty));
        Value *vOffsets = MUL(vIndices, vScaleVec);
        Value *mask = MASK(vMask);
        for (uint32_t i = 0; i < mVWidth / 2; ++i)
        {
            // single component byte index
            Value *offset = VEXTRACT(vOffsets, C(i));
            // byte pointer to component
            Value *loadAddress = GEP(pBase, offset);
            loadAddress = BITCAST(loadAddress, PointerType::get(mDoubleTy, 0));
            // pointer to the value to load if we're masking off a component
            Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
            Value *selMask = VEXTRACT(mask, C(i));
            // switch in a safe address to load if we're trying to access a vertex
            Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
            Value *val = LOAD(validAddress);
            vGather = VINSERT(vGather, val, C(i));
        }
        STACKRESTORE(pStack);
    }

    return vGather;
}
//////////////////////////////////////////////////////////////////////////
/// @brief convert x86 <N x float> mask to llvm <N x i1> mask
Value* Builder::MASK(Value* vmask)
{
    Value* src = BITCAST(vmask, mSimdInt32Ty);
    return ICMP_SLT(src, VIMMED1(0));
}

//////////////////////////////////////////////////////////////////////////
/// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
Value* Builder::VMASK(Value* mask)
{
    return S_EXT(mask, mSimdInt32Ty);
}
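
// Convention note: x86 vector masks carry validity in the sign bit of each
// 32-bit lane, so MASK() tests lane < 0 after a bitcast to <N x i32>, while
// VMASK() sign-extends <N x i1> back to all-ones/all-zeros lanes; e.g. an i1
// 'true' becomes 0xFFFFFFFF, whose sign bit satisfies MASK() again.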
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a VPSHUFB operation in LLVM IR. If not
/// supported on the underlying platform, emulate it
/// @param a - 256bit SIMD(32x8bit) of 8bit integer values
/// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
/// Byte masks in the lower 128bit lane of b select 8 bit values from the lower
/// 128bits of a, and vice versa for the upper lanes. If the mask
/// value is negative, '0' is inserted.
Value *Builder::PSHUFB(Value* a, Value* b)
{
    Value* res;
    // use avx2 pshufb instruction if available
    if (JM()->mArch.AVX2())
    {
        res = VPSHUFB(a, b);
    }
    else
    {
        Constant* cB = dyn_cast<Constant>(b);
        // number of 8 bit elements in b
        uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
        // output vector
        Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));

        // insert an 8 bit value from the high and low lanes of a per loop iteration
        numElms /= 2;
        for (uint32_t i = 0; i < numElms; i++)
        {
            ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
            ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));

            // extract values from constant mask
            char valLow128bLane = (char)(cLow128b->getSExtValue());
            char valHigh128bLane = (char)(cHigh128b->getSExtValue());

            Value* insertValLow128b;
            Value* insertValHigh128b;

            // if the mask value is negative, insert a '0' in the respective output position
            // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
            insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
            insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));

            vShuf = VINSERT(vShuf, insertValLow128b, i);
            vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
        }
        res = vShuf;
    }
    return res;
}
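
// Example of the mask semantics (illustrative): a mask byte of 0x80 is
// negative, so that output lane becomes 0; a mask byte of 3 selects a[3] for
// the low 128-bit half, or a[3 + 16] for the high half of a 32x8bit vector.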
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32
/// bits) in LLVM IR. If not supported on the underlying platform, emulate it
/// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
/// lower 8 values are used.
Value *Builder::PMOVSXBD(Value* a)
{
    // llvm-3.9 removed the pmovsxbd intrinsic
#if HAVE_LLVM < 0x309
    // use avx2 byte sign extend instruction if available
    if (JM()->mArch.AVX2())
    {
        Function *pmovsxbd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd);
        return CALL(pmovsxbd, std::initializer_list<Value*>{a});
    }
    else
#endif
    {
        // VPMOVSXBD output type
        Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
        // Extract 8 values from 128bit lane and sign extend
        return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
    }
}
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32
/// bits) in LLVM IR. If not supported on the underlying platform, emulate it
/// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
Value *Builder::PMOVSXWD(Value* a)
{
    // llvm-3.9 removed the pmovsxwd intrinsic
#if HAVE_LLVM < 0x309
    // use avx2 word sign extend if available
    if (JM()->mArch.AVX2())
    {
        Function *pmovsxwd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd);
        return CALL(pmovsxwd, std::initializer_list<Value*>{a});
    }
    else
#endif
    {
        // VPMOVSXWD output type
        Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
        // Extract 8 values from 128bit lane and sign extend
        return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
    }
}
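
// Note: on llvm >= 3.9 the shuffle + sign-extend sequences above are what the
// x86 backend typically pattern-matches back into vpmovsxbd/vpmovsxwd, which
// is why dropping the raw intrinsics normally costs nothing.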
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a VPERMD operation (shuffle 32 bit integer values
/// across 128 bit lanes) in LLVM IR. If not supported on the underlying
/// platform, emulate it
/// @param a - 256bit SIMD lane(8x32bit) of integer values.
/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
Value *Builder::PERMD(Value* a, Value* idx)
{
    Value* res;
    // use avx2 permute instruction if available
    if (JM()->mArch.AVX2())
    {
        res = VPERMD(a, idx);
    }
    else
    {
        if (isa<Constant>(idx))
        {
            res = VSHUFFLE(a, a, idx);
        }
        else
        {
            res = VUNDEF_I();
            for (uint32_t l = 0; l < JM()->mVWidth; ++l)
            {
                Value* pIndex = VEXTRACT(idx, C(l));
                Value* pVal = VEXTRACT(a, pIndex);
                res = VINSERT(res, pVal, C(l));
            }
        }
    }
    return res;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a VPERMPS operation (shuffle 32 bit float values
/// across 128 bit lanes) in LLVM IR. If not supported on the underlying
/// platform, emulate it
/// @param a - 256bit SIMD lane(8x32bit) of float values.
/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
Value *Builder::PERMPS(Value* a, Value* idx)
{
    Value* res;
    // use avx2 permute instruction if available
    if (JM()->mArch.AVX2())
    {
        // llvm 3.6.0 swapped the order of the args to vpermd
        res = VPERMPS(idx, a);
    }
    else
    {
        if (isa<Constant>(idx))
        {
            res = VSHUFFLE(a, a, idx);
        }
        else
        {
            res = VUNDEF_F();
            for (uint32_t l = 0; l < JM()->mVWidth; ++l)
            {
                Value* pIndex = VEXTRACT(idx, C(l));
                Value* pVal = VEXTRACT(a, pIndex);
                res = VINSERT(res, pVal, C(l));
            }
        }
    }
    return res;
}
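
// Design note for the two permutes above: LLVM's shufflevector requires a
// constant index vector, so the non-constant idx case must fall back to the
// scalar extract/insert loop instead of VSHUFFLE.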
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
/// in LLVM IR. If not supported on the underlying platform, emulate it
/// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
Value *Builder::CVTPH2PS(Value* a)
{
    if (JM()->mArch.F16C())
    {
        return VCVTPH2PS(a);
    }
    else
    {
        FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty, false);
        Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));

        if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
        {
            sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
        }

        Value* pResult = UndefValue::get(mSimdFP32Ty);
        for (uint32_t i = 0; i < mVWidth; ++i)
        {
            Value* pSrc = VEXTRACT(a, C(i));
            Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
            pResult = VINSERT(pResult, pConv, C(i));
        }

        return pResult;
    }
}
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
/// in LLVM IR. If not supported on the underlying platform, emulate it
/// @param a - 256bit SIMD lane(8x32bit) of float32 values to convert.
Value *Builder::CVTPS2PH(Value* a, Value* rounding)
{
    if (JM()->mArch.F16C())
    {
        return VCVTPS2PH(a, rounding);
    }
    else
    {
        // call scalar C function for now
        FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty, false);
        Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));

        if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
        {
            sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
        }

        Value* pResult = UndefValue::get(mSimdInt16Ty);
        for (uint32_t i = 0; i < mVWidth; ++i)
        {
            Value* pSrc = VEXTRACT(a, C(i));
            Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
            pResult = VINSERT(pResult, pConv, C(i));
        }

        return pResult;
    }
}
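
// Note on the two conversion fallbacks above: each JITs one call per lane into
// the scalar C helpers defined at the top of this file, so a non-F16C target
// pays mVWidth calls per SIMD conversion; correctness over speed on a rare path.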
Value *Builder::PMAXSD(Value* a, Value* b)
{
    // llvm-3.9 removed the pmax intrinsics
#if HAVE_LLVM >= 0x309
    Value* cmp = ICMP_SGT(a, b);
    return SELECT(cmp, a, b);
#else
    if (JM()->mArch.AVX2())
    {
        Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmaxs_d);
        return CALL(pmaxsd, {a, b});
    }
    else
    {
        // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
        Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);

        // low 128
        Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
        Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
        Value* resLo = CALL(pmaxsd, {aLo, bLo});

        // high 128
        Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
        Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
        Value* resHi = CALL(pmaxsd, {aHi, bHi});

        // combine
        Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
        result = VINSERTI128(result, resHi, C((uint8_t)1));

        return result;
    }
#endif
}
Value *Builder::PMINSD(Value* a, Value* b)
{
    // llvm-3.9 removed the pmin intrinsics
#if HAVE_LLVM >= 0x309
    Value* cmp = ICMP_SLT(a, b);
    return SELECT(cmp, a, b);
#else
    if (JM()->mArch.AVX2())
    {
        Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmins_d);
        return CALL(pminsd, {a, b});
    }
    else
    {
        // use 4-wide sse min intrinsic on lower/upper halves of 8-wide sources
        Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);

        // low 128
        Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
        Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
        Value* resLo = CALL(pminsd, {aLo, bLo});

        // high 128
        Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
        Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
        Value* resHi = CALL(pminsd, {aHi, bHi});

        // combine
        Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
        result = VINSERTI128(result, resHi, C((uint8_t)1));

        return result;
    }
#endif
}
void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
                      Value* mask, Value* vGatherComponents[], bool bPackedOutput)
{
    const SWR_FORMAT_INFO &info = GetFormatInfo(format);
    if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
    {
        // ensure our mask is the correct type
        mask = BITCAST(mask, mSimdFP32Ty);
        GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
    }
    else
    {
        // ensure our mask is the correct type
        mask = BITCAST(mask, mSimdInt32Ty);
        GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
    }
}
void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
                        Value* mask, Value* vGatherComponents[], bool bPackedOutput)
{
    switch (info.bpp / info.numComps)
    {
    case 16:
    {
        Value* vGatherResult[2];
        Value* vMask;

        // TODO: vGatherMaskedVal
        Value* vGatherMaskedVal = VIMMED1((float)0);

        // always have at least one component out of x or y to fetch

        // save mask as it is zero'd out after each gather
        vMask = mask;

        vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
        // e.g. result of first 8x32bit integer gather for 16bit components
        // 256i - 0    1    2    3    4    5    6    7
        //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
        //

        // if we have at least one component out of z or w to fetch
        if (info.numComps > 2)
        {
            // offset base to the next components(zw) in the vertex to gather
            pSrcBase = GEP(pSrcBase, C((char)4));
            vMask = mask;

            vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
            // e.g. result of second 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
            //
        }
        else
        {
            vGatherResult[1] = vGatherMaskedVal;
        }

        // Shuffle gathered components into place, each row is a component
        Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        break;
    }
    case 32:
    {
        // apply defaults
        for (uint32_t i = 0; i < 4; ++i)
        {
            vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
        }

        for (uint32_t i = 0; i < info.numComps; i++)
        {
            uint32_t swizzleIndex = info.swizzle[i];

            // save mask as it is zero'd out after each gather
            Value *vMask = mask;

            // Gather a SIMD of components
            vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));

            // offset base to the next component to gather
            pSrcBase = GEP(pSrcBase, C((char)4));
        }
        break;
    }
    default:
        SWR_INVALID("Invalid float format");
        break;
    }
}
void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
                        Value* mask, Value* vGatherComponents[], bool bPackedOutput)
{
    switch (info.bpp / info.numComps)
    {
    case 8:
    {
        Value* vGatherMaskedVal = VIMMED1((int32_t)0);
        Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
        // e.g. result of an 8x32bit integer gather for 8bit components
        // 256i - 0    1    2    3    4    5    6    7
        //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

        Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        break;
    }
    case 16:
    {
        Value* vGatherResult[2];
        Value* vMask;

        // TODO: vGatherMaskedVal
        Value* vGatherMaskedVal = VIMMED1((int32_t)0);

        // always have at least one component out of x or y to fetch

        // save mask as it is zero'd out after each gather
        vMask = mask;

        vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
        // e.g. result of first 8x32bit integer gather for 16bit components
        // 256i - 0    1    2    3    4    5    6    7
        //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
        //

        // if we have at least one component out of z or w to fetch
        if (info.numComps > 2)
        {
            // offset base to the next components(zw) in the vertex to gather
            pSrcBase = GEP(pSrcBase, C((char)4));
            vMask = mask;

            vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
            // e.g. result of second 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
            //
        }
        else
        {
            vGatherResult[1] = vGatherMaskedVal;
        }

        // Shuffle gathered components into place, each row is a component
        Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        break;
    }
    case 32:
    {
        // apply defaults
        for (uint32_t i = 0; i < 4; ++i)
        {
            vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
        }

        for (uint32_t i = 0; i < info.numComps; i++)
        {
            uint32_t swizzleIndex = info.swizzle[i];

            // save mask as it is zero'd out after each gather
            Value *vMask = mask;

            // Gather a SIMD of components
            vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));

            // offset base to the next component to gather
            pSrcBase = GEP(pSrcBase, C((char)4));
        }
        break;
    }
    default:
        SWR_INVALID("unsupported format");
        break;
    }
}
void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
{
    // cast types
    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

    // input could either be float or int vector; do shuffle work in int
    vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
    vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

    if (bPackedOutput)
    {
        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits

        // shuffle mask
        Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                     0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
        Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
        // after pshufb: group components together in each 128bit lane
        // 256i - 0    1    2    3    4    5    6    7
        //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

        Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
        // after PERMD: move and pack xy components into each 128bit lane
        // 256i - 0    1    2    3    4    5    6    7
        //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

        // do the same for zw components
        Value* vi128ZW = nullptr;
        if (info.numComps > 2)
        {
            Value* vShufResultZW = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
            vi128ZW = BITCAST(PERMD(vShufResultZW, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
        }

        for (uint32_t i = 0; i < 4; i++)
        {
            uint32_t swizzleIndex = info.swizzle[i];
            // todo: fixed for packed
            Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
            if (i >= info.numComps)
            {
                // set the default component val
                vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                continue;
            }

            // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
            uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
            // if x or y, use vi128XY permute result, else use vi128ZW
            Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

            // extract packed component 128 bit lanes
            vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
        }
    }
    else
    {
        // pshufb masks for each component
        Value* vConstMask[2];
        // x/z shuffle mask
        vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                                 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1});
        // y/w shuffle mask
        vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});

        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
        // apply defaults
        for (uint32_t i = 0; i < 4; ++i)
        {
            vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
        }

        for (uint32_t i = 0; i < info.numComps; i++)
        {
            uint32_t swizzleIndex = info.swizzle[i];

            // select correct constMask for x/z or y/w pshufb
            uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
            // if x or y, use the first gather result, else the second (zw)
            uint32_t selectedGather = (i < 2) ? 0 : 1;

            vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
            // after pshufb mask for x channel; z uses the same shuffle from the second gather
            // 256i - 0    1    2    3    4    5    6    7
            //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
        }
    }
}
void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
{
    // cast types
    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

    if (bPackedOutput)
    {
        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
        // shuffle mask
        Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                     0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
        Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
        // after pshufb: group components together in each 128bit lane
        // 256i - 0    1    2    3    4    5    6    7
        //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

        Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
        // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
        // 256i - 0    1    2    3    4    5    6    7
        //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

        // do the same for zw components
        Value* vi128ZW = nullptr;
        if (info.numComps > 2)
        {
            vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
        }

        // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
        for (uint32_t i = 0; i < 4; i++)
        {
            uint32_t swizzleIndex = info.swizzle[i];
            // todo: fix for packed
            Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
            if (i >= info.numComps)
            {
                // set the default component val
                vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                continue;
            }

            // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
            uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
            // if x or y, use vi128XY permute result, else use vi128ZW
            Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

            // extract packed component 128 bit lanes
            vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
        }
    }
    else
    {
        // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
        // apply defaults
        for (uint32_t i = 0; i < 4; ++i)
        {
            vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
        }

        for (uint32_t i = 0; i < info.numComps; i++)
        {
            uint32_t swizzleIndex = info.swizzle[i];

            // pshufb masks for each component
            Value* vConstMask;
            switch (i)
            {
            case 0:
                // x shuffle mask
                vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                      0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                break;
            case 1:
                // y shuffle mask
                vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                      1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                break;
            case 2:
                // z shuffle mask
                vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                      2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                break;
            case 3:
                // w shuffle mask
                vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                      3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                break;
            default:
                vConstMask = nullptr;
                break;
            }

            vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb for x channel
            // 256i - 0    1    2    3    4    5    6    7
            //        x000 x000 x000 x000 x000 x000 x000 x000
        }
    }
}
// Helper function to create alloca in entry block of function
Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
{
    auto saveIP = IRB()->saveIP();
    IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
                          pFunc->getEntryBlock().begin());
    Value* pAlloca = ALLOCA(pType);
    if (saveIP.isSet()) IRB()->restoreIP(saveIP);
    return pAlloca;
}

Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
{
    auto saveIP = IRB()->saveIP();
    IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
                          pFunc->getEntryBlock().begin());
    Value* pAlloca = ALLOCA(pType, pArraySize);
    if (saveIP.isSet()) IRB()->restoreIP(saveIP);
    return pAlloca;
}
//////////////////////////////////////////////////////////////////////////
/// @brief emulates a scatter operation.
/// @param pDst - pointer to destination
/// @param vSrc - vector of src data to scatter
/// @param vOffsets - vector of byte offsets from pDst
/// @param vMask - mask of valid lanes
void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
{
    /* Scatter algorithm

       while(Index = BitScanForward(mask))
            srcElem = srcVector[Index]
            offsetElem = offsetVector[Index]
            *(pDst + offsetElem) = srcElem
            Update mask (&= ~(1<<Index))

    */

    BasicBlock* pCurBB = IRB()->GetInsertBlock();
    Function* pFunc = pCurBB->getParent();
    Type* pSrcTy = vSrc->getType()->getVectorElementType();

    // Store vectors on stack
    if (pScatterStackSrc == nullptr)
    {
        // Save off stack allocations and reuse per scatter. Significantly reduces stack
        // requirements for shaders with a lot of scatters.
        pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
        pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
    }

    Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
    Value* pOffsetsArrayPtr = pScatterStackOffsets;
    STORE(vSrc, pSrcArrayPtr);
    STORE(vOffsets, pOffsetsArrayPtr);

    // Cast to pointers for random access
    pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
    pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

    Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));

    // Get cttz function
    Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });

    // Setup loop basic block
    BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc);

    // compute first set bit
    Value* pIndex = CALL(pfnCttz, { pMask, C(false) });

    Value* pIsUndef = ICMP_EQ(pIndex, C(32));

    // Split current block
    BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

    // Remove unconditional jump created by splitBasicBlock
    pCurBB->getTerminator()->eraseFromParent();

    // Add terminator to end of original block
    IRB()->SetInsertPoint(pCurBB);

    // Add conditional branch
    COND_BR(pIsUndef, pPostLoop, pLoop);

    // Add loop basic block contents
    IRB()->SetInsertPoint(pLoop);
    PHINode* pIndexPhi = PHI(mInt32Ty, 2);
    PHINode* pMaskPhi = PHI(mInt32Ty, 2);

    pIndexPhi->addIncoming(pIndex, pCurBB);
    pMaskPhi->addIncoming(pMask, pCurBB);

    // Extract elements for this index
    Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
    Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });

    // GEP to this offset in dst
    Value* pCurDst = GEP(pDst, pOffsetElem);
    pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
    STORE(pSrcElem, pCurDst);

    // Update the mask
    Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

    // Compute next set bit
    Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });

    pIsUndef = ICMP_EQ(pNewIndex, C(32));
    COND_BR(pIsUndef, pPostLoop, pLoop);

    // Update the phi edges coming from the loop
    pIndexPhi->addIncoming(pNewIndex, pLoop);
    pMaskPhi->addIncoming(pNewMask, pLoop);

    // Move builder to beginning of post loop
    IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
}
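
// Worked example of the loop above (illustrative): if vMask enables lanes 0 and
// 3, VMOVMSKPS yields pMask = 0b1001; cttz finds index 0, lane 0 is stored and
// the mask becomes 0b1000; cttz then finds 3, lane 3 is stored and the mask
// becomes 0; cttz of 0 returns 32, pIsUndef is true, and control exits to
// pPostLoop.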
Value* Builder::VABSPS(Value* a)
{
    Value* asInt = BITCAST(a, mSimdInt32Ty);
    Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
    return result;
}
Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
{
    Value *lowCmp = ICMP_SLT(src, low);
    Value *ret = SELECT(lowCmp, low, src);

    Value *highCmp = ICMP_SGT(ret, high);
    ret = SELECT(highCmp, high, ret);

    return ret;
}
Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
{
    Value *lowCmp = FCMP_OLT(src, low);
    Value *ret = SELECT(lowCmp, low, src);

    Value *highCmp = FCMP_OGT(ret, high);
    ret = SELECT(highCmp, high, ret);

    return ret;
}
Value *Builder::FCLAMP(Value* src, float low, float high)
{
    Value* result = VMAXPS(src, VIMMED1(low));
    result = VMINPS(result, VIMMED1(high));

    return result;
}
//////////////////////////////////////////////////////////////////////////
/// @brief save/restore stack, providing ability to push/pop the stack and
///        reduce overall stack requirements for temporary stack use
Value* Builder::STACKSAVE()
{
    Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
    return CALLA(pfnStackSave);
}

void Builder::STACKRESTORE(Value* pSaved)
{
    Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
    CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
}
Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
{
    Value* vOut;
    // use FMADs if available
    if (JM()->mArch.AVX2())
    {
        vOut = VFMADDPS(a, b, c);
    }
    else
    {
        vOut = FADD(FMUL(a, b), c);
    }
    return vOut;
}
Value* Builder::POPCNT(Value* a)
{
    Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
    return CALL(pCtPop, std::initializer_list<Value*>{a});
}
//////////////////////////////////////////////////////////////////////////
/// @brief C functions called by LLVM IR
//////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////
/// @brief called in JIT code, inserted by PRINT
/// output to both stdout and visual studio debug console
void __cdecl CallPrint(const char* fmt, ...)
{
    va_list args;
    va_start(args, fmt);
    vprintf(fmt, args);

#if defined( _WIN32 )
    char strBuf[1024];
    vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
    OutputDebugString(strBuf);
#endif

    va_end(args);
}
Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
{
    bool flag = !imm8->isZeroValue();
    SmallVector<Constant*, 8> idx;
    for (unsigned i = 0; i < mVWidth / 2; i++) {
        idx.push_back(C(flag ? i + mVWidth / 2 : i));
    }
    return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
}
Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
{
    bool flag = !imm8->isZeroValue();
    SmallVector<Constant*, 8> idx;
    for (unsigned i = 0; i < mVWidth; i++) {
        idx.push_back(C(i));
    }
    Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));

    SmallVector<Constant*, 8> idx2;
    for (unsigned i = 0; i < mVWidth / 2; i++) {
        idx2.push_back(C(flag ? i : i + mVWidth));
    }
    for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
        idx2.push_back(C(flag ? i + mVWidth / 2 : i));
    }
    return VSHUFFLE(a, inter, ConstantVector::get(idx2));
}
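
// Both helpers above emulate the AVX2 128-bit lane extract/insert with plain
// shufflevector operations; e.g. VEXTRACTI128(a, C((uint8_t)1)) selects
// elements mVWidth/2..mVWidth-1 of a into a half-width result vector.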
// rdtsc buckets macros
void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
{
    // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
    // buckets framework when single threaded
    if (KNOB_SINGLE_THREADED)
    {
        std::vector<Type*> args{
            PointerType::get(mInt32Ty, 0),  // pBucketMgr
            mInt32Ty                        // id
        };

        FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
        Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
        if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
        {
            sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
        }

        CALL(pFunc, { pBucketMgr, pId });
    }
}

void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
{
    // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
    // buckets framework when single threaded
    if (KNOB_SINGLE_THREADED)
    {
        std::vector<Type*> args{
            PointerType::get(mInt32Ty, 0),  // pBucketMgr
            mInt32Ty                        // id
        };

        FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
        Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
        if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
        {
            sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
        }

        CALL(pFunc, { pBucketMgr, pId });
    }
}
uint32_t Builder::GetTypeSize(Type* pType)
{
    if (pType->isStructTy())
    {
        uint32_t numElems = pType->getStructNumElements();
        Type* pElemTy = pType->getStructElementType(0);
        return numElems * GetTypeSize(pElemTy);
    }

    if (pType->isArrayTy())
    {
        uint32_t numElems = pType->getArrayNumElements();
        Type* pElemTy = pType->getArrayElementType();
        return numElems * GetTypeSize(pElemTy);
    }

    if (pType->isIntegerTy())
    {
        uint32_t bitSize = pType->getIntegerBitWidth();
        return bitSize / 8;
    }

    if (pType->isFloatTy())
    {
        return 4;
    }

    if (pType->isHalfTy())
    {
        return 2;
    }

    if (pType->isDoubleTy())
    {
        return 8;
    }

    SWR_ASSERT(false, "Unimplemented type.");
    return 0;
}
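
// Note: the struct path above computes numElems * sizeof(first element), which
// is exact only for homogeneous, unpadded structs; presumably sufficient for
// the types this JIT sizes here, but worth keeping in mind for new callers.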