1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * @file builder_misc.cpp
25 * @brief Implementation for miscellaneous builder functions
29 ******************************************************************************/
#include "jit_pch.hpp"

#include "common/rdtsc_buckets.h"

#include <cmath>
#include <cstring>
38 void __cdecl
CallPrint(const char* fmt
, ...);
40 //////////////////////////////////////////////////////////////////////////
41 /// @brief Convert an IEEE 754 32-bit single precision float to an
42 /// 16 bit float with 5 exponent bits and a variable
43 /// number of mantissa bits.
44 /// @param val - 32-bit float
45 /// @todo Maybe move this outside of this file into a header?
46 static uint16_t ConvertFloat32ToFloat16(float val
)
48 uint32_t sign
, exp
, mant
;
51 // Extract the sign, exponent, and mantissa
52 uint32_t uf
= *(uint32_t*)&val
;
53 sign
= (uf
& 0x80000000) >> 31;
54 exp
= (uf
& 0x7F800000) >> 23;
55 mant
= uf
& 0x007FFFFF;
57 // Check for out of range
62 sign
= 1; // set the sign bit for NANs
64 else if (std::isinf(val
))
69 else if (exp
> (0x70 + 0x1E)) // Too big to represent -> max representable value
74 else if ((exp
<= 0x70) && (exp
>= 0x66)) // It's a denorm
77 for (; exp
<= 0x70; mant
>>= 1, exp
++)
82 else if (exp
< 0x66) // Too small to represent -> Zero
89 // Saves bits that will be shifted off for rounding
90 roundBits
= mant
& 0x1FFFu
;
91 // convert exponent and mantissa to 16 bit format
95 // Essentially RTZ, but round up if off by only 1 lsb
96 if (roundBits
== 0x1FFFu
)
100 if ((mant
& 0xC00u
) != 0)
102 // make sure only the needed bits are used
107 uint32_t tmpVal
= (sign
<< 15) | (exp
<< 10) | mant
;
108 return (uint16_t)tmpVal
;
111 //////////////////////////////////////////////////////////////////////////
112 /// @brief Convert an IEEE 754 16-bit float to an 32-bit single precision
114 /// @param val - 16-bit float
115 /// @todo Maybe move this outside of this file into a header?
116 static float ConvertFloat16ToFloat32(uint32_t val
)
119 if ((val
& 0x7fff) == 0)
121 result
= ((uint32_t)(val
& 0x8000)) << 16;
123 else if ((val
& 0x7c00) == 0x7c00)
125 result
= ((val
& 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
126 result
|= ((uint32_t)val
& 0x8000) << 16;
130 uint32_t sign
= (val
& 0x8000) << 16;
131 uint32_t mant
= (val
& 0x3ff) << 13;
132 uint32_t exp
= (val
>> 10) & 0x1f;
133 if ((exp
== 0) && (mant
!= 0)) // Adjust exponent and mantissa for denormals
136 while (mant
< (0x400 << 13))
141 mant
&= (0x3ff << 13);
143 exp
= ((exp
- 15 + 127) & 0xff) << 23;
144 result
= sign
| exp
| mant
;
147 return *(float*)&result
;
150 Constant
*Builder::C(bool i
)
152 return ConstantInt::get(IRB()->getInt1Ty(), (i
? 1 : 0));
155 Constant
*Builder::C(char i
)
157 return ConstantInt::get(IRB()->getInt8Ty(), i
);
160 Constant
*Builder::C(uint8_t i
)
162 return ConstantInt::get(IRB()->getInt8Ty(), i
);
165 Constant
*Builder::C(int i
)
167 return ConstantInt::get(IRB()->getInt32Ty(), i
);
170 Constant
*Builder::C(int64_t i
)
172 return ConstantInt::get(IRB()->getInt64Ty(), i
);
175 Constant
*Builder::C(uint16_t i
)
177 return ConstantInt::get(mInt16Ty
,i
);
180 Constant
*Builder::C(uint32_t i
)
182 return ConstantInt::get(IRB()->getInt32Ty(), i
);
185 Constant
*Builder::C(float i
)
187 return ConstantFP::get(IRB()->getFloatTy(), i
);
190 Constant
*Builder::PRED(bool pred
)
192 return ConstantInt::get(IRB()->getInt1Ty(), (pred
? 1 : 0));
195 Value
*Builder::VIMMED1(int i
)
197 return ConstantVector::getSplat(mVWidth
, cast
<ConstantInt
>(C(i
)));
200 Value
*Builder::VIMMED1_16(int i
)
202 return ConstantVector::getSplat(mVWidth16
, cast
<ConstantInt
>(C(i
)));
205 Value
*Builder::VIMMED1(uint32_t i
)
207 return ConstantVector::getSplat(mVWidth
, cast
<ConstantInt
>(C(i
)));
210 Value
*Builder::VIMMED1_16(uint32_t i
)
212 return ConstantVector::getSplat(mVWidth16
, cast
<ConstantInt
>(C(i
)));
215 Value
*Builder::VIMMED1(float i
)
217 return ConstantVector::getSplat(mVWidth
, cast
<ConstantFP
>(C(i
)));
220 Value
*Builder::VIMMED1_16(float i
)
222 return ConstantVector::getSplat(mVWidth16
, cast
<ConstantFP
>(C(i
)));
225 Value
*Builder::VIMMED1(bool i
)
227 return ConstantVector::getSplat(mVWidth
, cast
<ConstantInt
>(C(i
)));
230 Value
*Builder::VIMMED1_16(bool i
)
232 return ConstantVector::getSplat(mVWidth16
, cast
<ConstantInt
>(C(i
)));
235 Value
*Builder::VUNDEF_IPTR()
237 return UndefValue::get(VectorType::get(mInt32PtrTy
,mVWidth
));
240 Value
*Builder::VUNDEF(Type
* t
)
242 return UndefValue::get(VectorType::get(t
, mVWidth
));
245 Value
*Builder::VUNDEF_I()
247 return UndefValue::get(VectorType::get(mInt32Ty
, mVWidth
));
250 Value
*Builder::VUNDEF_I_16()
252 return UndefValue::get(VectorType::get(mInt32Ty
, mVWidth16
));
255 Value
*Builder::VUNDEF_F()
257 return UndefValue::get(VectorType::get(mFP32Ty
, mVWidth
));
260 Value
*Builder::VUNDEF_F_16()
262 return UndefValue::get(VectorType::get(mFP32Ty
, mVWidth16
));
265 Value
*Builder::VUNDEF(Type
*ty
, uint32_t size
)
267 return UndefValue::get(VectorType::get(ty
, size
));
270 Value
*Builder::VBROADCAST(Value
*src
, const llvm::Twine
& name
)
272 // check if src is already a vector
273 if (src
->getType()->isVectorTy())
278 return VECTOR_SPLAT(mVWidth
, src
, name
);
281 Value
*Builder::VBROADCAST_16(Value
*src
)
283 // check if src is already a vector
284 if (src
->getType()->isVectorTy())
289 return VECTOR_SPLAT(mVWidth16
, src
);
292 uint32_t Builder::IMMED(Value
* v
)
294 SWR_ASSERT(isa
<ConstantInt
>(v
));
295 ConstantInt
*pValConst
= cast
<ConstantInt
>(v
);
296 return pValConst
->getZExtValue();
299 int32_t Builder::S_IMMED(Value
* v
)
301 SWR_ASSERT(isa
<ConstantInt
>(v
));
302 ConstantInt
*pValConst
= cast
<ConstantInt
>(v
);
303 return pValConst
->getSExtValue();
306 Value
*Builder::GEP(Value
* ptr
, const std::initializer_list
<Value
*> &indexList
)
308 std::vector
<Value
*> indices
;
309 for (auto i
: indexList
)
310 indices
.push_back(i
);
311 return GEPA(ptr
, indices
);
314 Value
*Builder::GEP(Value
* ptr
, const std::initializer_list
<uint32_t> &indexList
)
316 std::vector
<Value
*> indices
;
317 for (auto i
: indexList
)
318 indices
.push_back(C(i
));
319 return GEPA(ptr
, indices
);
322 Value
*Builder::IN_BOUNDS_GEP(Value
* ptr
, const std::initializer_list
<Value
*> &indexList
)
324 std::vector
<Value
*> indices
;
325 for (auto i
: indexList
)
326 indices
.push_back(i
);
327 return IN_BOUNDS_GEP(ptr
, indices
);
330 Value
*Builder::IN_BOUNDS_GEP(Value
* ptr
, const std::initializer_list
<uint32_t> &indexList
)
332 std::vector
<Value
*> indices
;
333 for (auto i
: indexList
)
334 indices
.push_back(C(i
));
335 return IN_BOUNDS_GEP(ptr
, indices
);
338 LoadInst
*Builder::LOAD(Value
*basePtr
, const std::initializer_list
<uint32_t> &indices
, const llvm::Twine
& name
)
340 std::vector
<Value
*> valIndices
;
341 for (auto i
: indices
)
342 valIndices
.push_back(C(i
));
343 return LOAD(GEPA(basePtr
, valIndices
), name
);
346 LoadInst
*Builder::LOADV(Value
*basePtr
, const std::initializer_list
<Value
*> &indices
, const llvm::Twine
& name
)
348 std::vector
<Value
*> valIndices
;
349 for (auto i
: indices
)
350 valIndices
.push_back(i
);
351 return LOAD(GEPA(basePtr
, valIndices
), name
);
354 StoreInst
*Builder::STORE(Value
*val
, Value
*basePtr
, const std::initializer_list
<uint32_t> &indices
)
356 std::vector
<Value
*> valIndices
;
357 for (auto i
: indices
)
358 valIndices
.push_back(C(i
));
359 return STORE(val
, GEPA(basePtr
, valIndices
));
362 StoreInst
*Builder::STOREV(Value
*val
, Value
*basePtr
, const std::initializer_list
<Value
*> &indices
)
364 std::vector
<Value
*> valIndices
;
365 for (auto i
: indices
)
366 valIndices
.push_back(i
);
367 return STORE(val
, GEPA(basePtr
, valIndices
));
370 CallInst
*Builder::CALL(Value
*Callee
, const std::initializer_list
<Value
*> &argsList
, const llvm::Twine
& name
)
372 std::vector
<Value
*> args
;
373 for (auto arg
: argsList
)
375 return CALLA(Callee
, args
, name
);
378 CallInst
*Builder::CALL(Value
*Callee
, Value
* arg
)
380 std::vector
<Value
*> args
;
382 return CALLA(Callee
, args
);
385 CallInst
*Builder::CALL2(Value
*Callee
, Value
* arg1
, Value
* arg2
)
387 std::vector
<Value
*> args
;
388 args
.push_back(arg1
);
389 args
.push_back(arg2
);
390 return CALLA(Callee
, args
);
393 CallInst
*Builder::CALL3(Value
*Callee
, Value
* arg1
, Value
* arg2
, Value
* arg3
)
395 std::vector
<Value
*> args
;
396 args
.push_back(arg1
);
397 args
.push_back(arg2
);
398 args
.push_back(arg3
);
399 return CALLA(Callee
, args
);
402 //////////////////////////////////////////////////////////////////////////
403 Value
*Builder::DEBUGTRAP()
405 Function
*func
= Intrinsic::getDeclaration(JM()->mpCurrentModule
, Intrinsic::debugtrap
);
409 Value
*Builder::VRCP(Value
*va
, const llvm::Twine
& name
)
411 return FDIV(VIMMED1(1.0f
), va
, name
); // 1 / a
414 Value
*Builder::VPLANEPS(Value
* vA
, Value
* vB
, Value
* vC
, Value
* &vX
, Value
* &vY
)
416 Value
* vOut
= FMADDPS(vA
, vX
, vC
);
417 vOut
= FMADDPS(vB
, vY
, vOut
);
421 //////////////////////////////////////////////////////////////////////////
422 /// @brief Generate an i32 masked load operation in LLVM IR. If not
423 /// supported on the underlying platform, emulate it with float masked load
424 /// @param src - base address pointer for the load
425 /// @param vMask - SIMD wide mask that controls whether to access memory load 0
426 Value
*Builder::MASKLOADD(Value
* src
,Value
* mask
)
429 // use avx2 gather instruction is available
430 if(JM()->mArch
.AVX2())
432 Function
*func
= Intrinsic::getDeclaration(JM()->mpCurrentModule
, Intrinsic::x86_avx2_maskload_d_256
);
433 vResult
= CALL(func
,{src
,mask
});
437 // maskload intrinsic expects integer mask operand in llvm >= 3.8
438 #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
439 mask
= BITCAST(mask
,VectorType::get(mInt32Ty
,mVWidth
));
441 mask
= BITCAST(mask
,VectorType::get(mFP32Ty
,mVWidth
));
443 Function
*func
= Intrinsic::getDeclaration(JM()->mpCurrentModule
,Intrinsic::x86_avx_maskload_ps_256
);
444 vResult
= BITCAST(CALL(func
,{src
,mask
}), VectorType::get(mInt32Ty
,mVWidth
));
449 //////////////////////////////////////////////////////////////////////////
450 /// @brief insert a JIT call to CallPrint
451 /// - outputs formatted string to both stdout and VS output window
452 /// - DEBUG builds only
454 /// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
455 /// where C(lane) creates a constant value to print, and pIndex is the Value*
456 /// result from a GEP, printing out the pointer to memory
457 /// @param printStr - constant string to print, which includes format specifiers
458 /// @param printArgs - initializer list of Value*'s to print to std out
459 CallInst
*Builder::PRINT(const std::string
&printStr
,const std::initializer_list
<Value
*> &printArgs
)
461 // push the arguments to CallPrint into a vector
462 std::vector
<Value
*> printCallArgs
;
463 // save room for the format string. we still need to modify it for vectors
464 printCallArgs
.resize(1);
466 // search through the format string for special processing
468 std::string
tempStr(printStr
);
469 pos
= tempStr
.find('%', pos
);
470 auto v
= printArgs
.begin();
472 while ((pos
!= std::string::npos
) && (v
!= printArgs
.end()))
475 Type
* pType
= pArg
->getType();
477 if (pType
->isVectorTy())
479 Type
* pContainedType
= pType
->getContainedType(0);
481 if (toupper(tempStr
[pos
+ 1]) == 'X')
484 tempStr
[pos
+ 1] = 'x';
485 tempStr
.insert(pos
+ 2, "%08X ");
488 printCallArgs
.push_back(VEXTRACT(pArg
, C(0)));
490 std::string vectorFormatStr
;
491 for (uint32_t i
= 1; i
< pType
->getVectorNumElements(); ++i
)
493 vectorFormatStr
+= "0x%08X ";
494 printCallArgs
.push_back(VEXTRACT(pArg
, C(i
)));
497 tempStr
.insert(pos
, vectorFormatStr
);
498 pos
+= vectorFormatStr
.size();
500 else if ((tempStr
[pos
+ 1] == 'f') && (pContainedType
->isFloatTy()))
503 for (; i
< (pArg
->getType()->getVectorNumElements()) - 1; i
++)
505 tempStr
.insert(pos
, std::string("%f "));
507 printCallArgs
.push_back(FP_EXT(VEXTRACT(pArg
, C(i
)), Type::getDoubleTy(JM()->mContext
)));
509 printCallArgs
.push_back(FP_EXT(VEXTRACT(pArg
, C(i
)), Type::getDoubleTy(JM()->mContext
)));
511 else if ((tempStr
[pos
+ 1] == 'd') && (pContainedType
->isIntegerTy()))
514 for (; i
< (pArg
->getType()->getVectorNumElements()) - 1; i
++)
516 tempStr
.insert(pos
, std::string("%d "));
518 printCallArgs
.push_back(VEXTRACT(pArg
, C(i
)));
520 printCallArgs
.push_back(VEXTRACT(pArg
, C(i
)));
525 if (toupper(tempStr
[pos
+ 1]) == 'X')
528 tempStr
.insert(pos
+ 1, "x%08");
529 printCallArgs
.push_back(pArg
);
532 // for %f we need to cast float Values to doubles so that they print out correctly
533 else if ((tempStr
[pos
+ 1] == 'f') && (pType
->isFloatTy()))
535 printCallArgs
.push_back(FP_EXT(pArg
, Type::getDoubleTy(JM()->mContext
)));
540 printCallArgs
.push_back(pArg
);
544 // advance to the next arguement
546 pos
= tempStr
.find('%', ++pos
);
549 // create global variable constant string
550 Constant
*constString
= ConstantDataArray::getString(JM()->mContext
,tempStr
,true);
551 GlobalVariable
*gvPtr
= new GlobalVariable(constString
->getType(),true,GlobalValue::InternalLinkage
,constString
,"printStr");
552 JM()->mpCurrentModule
->getGlobalList().push_back(gvPtr
);
554 // get a pointer to the first character in the constant string array
555 std::vector
<Constant
*> geplist
{C(0),C(0)};
556 Constant
*strGEP
= ConstantExpr::getGetElementPtr(nullptr, gvPtr
,geplist
,false);
558 // insert the pointer to the format string in the argument vector
559 printCallArgs
[0] = strGEP
;
561 // get pointer to CallPrint function and insert decl into the module if needed
562 std::vector
<Type
*> args
;
563 args
.push_back(PointerType::get(mInt8Ty
,0));
564 FunctionType
* callPrintTy
= FunctionType::get(Type::getVoidTy(JM()->mContext
),args
,true);
565 Function
*callPrintFn
= cast
<Function
>(JM()->mpCurrentModule
->getOrInsertFunction("CallPrint", callPrintTy
));
567 // if we haven't yet added the symbol to the symbol table
568 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
570 sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint
);
573 // insert a call to CallPrint
574 return CALLA(callPrintFn
,printCallArgs
);
577 //////////////////////////////////////////////////////////////////////////
578 /// @brief Wrapper around PRINT with initializer list.
579 CallInst
* Builder::PRINT(const std::string
&printStr
)
581 return PRINT(printStr
, {});
584 //////////////////////////////////////////////////////////////////////////
585 /// @brief Generate a masked gather operation in LLVM IR. If not
586 /// supported on the underlying platform, emulate it with loads
587 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
588 /// @param pBase - Int8* base VB address pointer value
589 /// @param vIndices - SIMD wide value of VB byte offsets
590 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
591 /// @param scale - value to scale indices by
592 Value
*Builder::GATHERPS(Value
*vSrc
, Value
*pBase
, Value
*vIndices
, Value
*vMask
, uint8_t scale
)
596 // use avx2 gather instruction if available
597 if(JM()->mArch
.AVX2())
599 // force mask to <N x float>, required by vgather
600 Value
*mask
= BITCAST(VMASK(vMask
), mSimdFP32Ty
);
602 vGather
= VGATHERPS(vSrc
, pBase
, vIndices
, mask
, C(scale
));
606 Value
* pStack
= STACKSAVE();
608 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
609 Value
* vSrcPtr
= ALLOCA(vSrc
->getType());
610 STORE(vSrc
, vSrcPtr
);
612 vGather
= VUNDEF_F();
613 Value
*vScaleVec
= VIMMED1((uint32_t)scale
);
614 Value
*vOffsets
= MUL(vIndices
,vScaleVec
);
615 for(uint32_t i
= 0; i
< mVWidth
; ++i
)
617 // single component byte index
618 Value
*offset
= VEXTRACT(vOffsets
,C(i
));
619 // byte pointer to component
620 Value
*loadAddress
= GEP(pBase
,offset
);
621 loadAddress
= BITCAST(loadAddress
,PointerType::get(mFP32Ty
,0));
622 // pointer to the value to load if we're masking off a component
623 Value
*maskLoadAddress
= GEP(vSrcPtr
,{C(0), C(i
)});
624 Value
*selMask
= VEXTRACT(vMask
,C(i
));
625 // switch in a safe address to load if we're trying to access a vertex
626 Value
*validAddress
= SELECT(selMask
, loadAddress
, maskLoadAddress
);
627 Value
*val
= LOAD(validAddress
);
628 vGather
= VINSERT(vGather
,val
,C(i
));
631 STACKRESTORE(pStack
);
637 Value
*Builder::GATHERPS_16(Value
*vSrc
, Value
*pBase
, Value
*vIndices
, Value
*vMask
, uint8_t scale
)
639 Value
*vGather
= VUNDEF_F_16();
641 // use AVX512F gather instruction if available
642 if (JM()->mArch
.AVX512F())
644 // force mask to <N-bit Integer>, required by vgather2
645 Value
*mask
= BITCAST(vMask
, mInt16Ty
);
647 vGather
= VGATHERPS_16(vSrc
, pBase
, vIndices
, mask
, C((uint32_t)scale
));
651 Value
*src0
= EXTRACT_16(vSrc
, 0);
652 Value
*src1
= EXTRACT_16(vSrc
, 1);
654 Value
*indices0
= EXTRACT_16(vIndices
, 0);
655 Value
*indices1
= EXTRACT_16(vIndices
, 1);
657 Value
*mask0
= EXTRACT_16(vMask
, 0);
658 Value
*mask1
= EXTRACT_16(vMask
, 1);
660 Value
*gather0
= GATHERPS(src0
, pBase
, indices0
, mask0
, scale
);
661 Value
*gather1
= GATHERPS(src1
, pBase
, indices1
, mask1
, scale
);
663 vGather
= JOIN_16(gather0
, gather1
);
669 //////////////////////////////////////////////////////////////////////////
670 /// @brief Generate a masked gather operation in LLVM IR. If not
671 /// supported on the underlying platform, emulate it with loads
672 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
673 /// @param pBase - Int8* base VB address pointer value
674 /// @param vIndices - SIMD wide value of VB byte offsets
675 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
676 /// @param scale - value to scale indices by
677 Value
*Builder::GATHERDD(Value
* vSrc
, Value
* pBase
, Value
* vIndices
, Value
* vMask
, uint8_t scale
)
681 // use avx2 gather instruction if available
682 if(JM()->mArch
.AVX2())
684 vGather
= VGATHERDD(vSrc
, pBase
, vIndices
, VMASK(vMask
), C(scale
));
688 Value
* pStack
= STACKSAVE();
690 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
691 Value
* vSrcPtr
= ALLOCA(vSrc
->getType());
692 STORE(vSrc
, vSrcPtr
);
694 vGather
= VUNDEF_I();
695 Value
*vScaleVec
= VIMMED1((uint32_t)scale
);
696 Value
*vOffsets
= MUL(vIndices
, vScaleVec
);
697 for(uint32_t i
= 0; i
< mVWidth
; ++i
)
699 // single component byte index
700 Value
*offset
= VEXTRACT(vOffsets
, C(i
));
701 // byte pointer to component
702 Value
*loadAddress
= GEP(pBase
, offset
);
703 loadAddress
= BITCAST(loadAddress
, PointerType::get(mInt32Ty
, 0));
704 // pointer to the value to load if we're masking off a component
705 Value
*maskLoadAddress
= GEP(vSrcPtr
, {C(0), C(i
)});
706 Value
*selMask
= VEXTRACT(vMask
, C(i
));
707 // switch in a safe address to load if we're trying to access a vertex
708 Value
*validAddress
= SELECT(selMask
, loadAddress
, maskLoadAddress
);
709 Value
*val
= LOAD(validAddress
, C(0));
710 vGather
= VINSERT(vGather
, val
, C(i
));
713 STACKRESTORE(pStack
);
719 Value
*Builder::GATHERDD_16(Value
*vSrc
, Value
*pBase
, Value
*vIndices
, Value
*vMask
, uint8_t scale
)
721 Value
*vGather
= VUNDEF_I_16();
723 // use AVX512F gather instruction if available
724 if (JM()->mArch
.AVX512F())
726 // force mask to <N-bit Integer>, required by vgather2
727 Value
*mask
= BITCAST(vMask
, mInt16Ty
);
729 vGather
= VGATHERDD_16(vSrc
, pBase
, vIndices
, mask
, C((uint32_t)scale
));
733 Value
*src0
= EXTRACT_16(vSrc
, 0);
734 Value
*src1
= EXTRACT_16(vSrc
, 1);
736 Value
*indices0
= EXTRACT_16(vIndices
, 0);
737 Value
*indices1
= EXTRACT_16(vIndices
, 1);
739 Value
*mask0
= EXTRACT_16(vMask
, 0);
740 Value
*mask1
= EXTRACT_16(vMask
, 1);
742 Value
*gather0
= GATHERDD(src0
, pBase
, indices0
, mask0
, scale
);
743 Value
*gather1
= GATHERDD(src1
, pBase
, indices1
, mask1
, scale
);
745 vGather
= JOIN_16(gather0
, gather1
);
751 //////////////////////////////////////////////////////////////////////////
752 /// @brief Generate a masked gather operation in LLVM IR. If not
753 /// supported on the underlying platform, emulate it with loads
754 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
755 /// @param pBase - Int8* base VB address pointer value
756 /// @param vIndices - SIMD wide value of VB byte offsets
757 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
758 /// @param scale - value to scale indices by
759 Value
*Builder::GATHERPD(Value
* vSrc
, Value
* pBase
, Value
* vIndices
, Value
* vMask
, uint8_t scale
)
763 // use avx2 gather instruction if available
764 if(JM()->mArch
.AVX2())
766 vMask
= BITCAST(S_EXT(vMask
, VectorType::get(mInt64Ty
, mVWidth
/2)), VectorType::get(mDoubleTy
, mVWidth
/2));
767 vGather
= VGATHERPD(vSrc
, pBase
, vIndices
, vMask
, C(scale
));
771 Value
* pStack
= STACKSAVE();
773 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
774 Value
* vSrcPtr
= ALLOCA(vSrc
->getType());
775 STORE(vSrc
, vSrcPtr
);
777 vGather
= UndefValue::get(VectorType::get(mDoubleTy
, 4));
778 Value
*vScaleVec
= VECTOR_SPLAT(4, C((uint32_t)scale
));
779 Value
*vOffsets
= MUL(vIndices
,vScaleVec
);
780 for(uint32_t i
= 0; i
< mVWidth
/2; ++i
)
782 // single component byte index
783 Value
*offset
= VEXTRACT(vOffsets
,C(i
));
784 // byte pointer to component
785 Value
*loadAddress
= GEP(pBase
,offset
);
786 loadAddress
= BITCAST(loadAddress
,PointerType::get(mDoubleTy
,0));
787 // pointer to the value to load if we're masking off a component
788 Value
*maskLoadAddress
= GEP(vSrcPtr
,{C(0), C(i
)});
789 Value
*selMask
= VEXTRACT(vMask
,C(i
));
790 // switch in a safe address to load if we're trying to access a vertex
791 Value
*validAddress
= SELECT(selMask
, loadAddress
, maskLoadAddress
);
792 Value
*val
= LOAD(validAddress
);
793 vGather
= VINSERT(vGather
,val
,C(i
));
795 STACKRESTORE(pStack
);
800 Value
*Builder::EXTRACT_16(Value
*x
, uint32_t imm
)
804 return VSHUFFLE(x
, UndefValue::get(x
->getType()), { 0, 1, 2, 3, 4, 5, 6, 7 });
808 return VSHUFFLE(x
, UndefValue::get(x
->getType()), { 8, 9, 10, 11, 12, 13, 14, 15 });
812 Value
*Builder::JOIN_16(Value
*a
, Value
*b
)
814 return VSHUFFLE(a
, b
, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
817 //////////////////////////////////////////////////////////////////////////
818 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
819 Value
*Builder::MASK(Value
*vmask
)
821 Value
*src
= BITCAST(vmask
, mSimdInt32Ty
);
822 return ICMP_SLT(src
, VIMMED1(0));
825 Value
*Builder::MASK_16(Value
*vmask
)
827 Value
*src
= BITCAST(vmask
, mSimd16Int32Ty
);
828 return ICMP_SLT(src
, VIMMED1_16(0));
831 //////////////////////////////////////////////////////////////////////////
832 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
833 Value
*Builder::VMASK(Value
*mask
)
835 return S_EXT(mask
, mSimdInt32Ty
);
838 Value
*Builder::VMASK_16(Value
*mask
)
840 return S_EXT(mask
, mSimd16Int32Ty
);
843 //////////////////////////////////////////////////////////////////////////
844 /// @brief Generate a VPSHUFB operation in LLVM IR. If not
845 /// supported on the underlying platform, emulate it
846 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
847 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
848 /// Byte masks in lower 128 lane of b selects 8 bit values from lower
849 /// 128bits of a, and vice versa for the upper lanes. If the mask
850 /// value is negative, '0' is inserted.
851 Value
*Builder::PSHUFB(Value
* a
, Value
* b
)
854 // use avx2 pshufb instruction if available
855 if(JM()->mArch
.AVX2())
861 Constant
* cB
= dyn_cast
<Constant
>(b
);
862 // number of 8 bit elements in b
863 uint32_t numElms
= cast
<VectorType
>(cB
->getType())->getNumElements();
865 Value
* vShuf
= UndefValue::get(VectorType::get(mInt8Ty
, numElms
));
867 // insert an 8 bit value from the high and low lanes of a per loop iteration
869 for(uint32_t i
= 0; i
< numElms
; i
++)
871 ConstantInt
* cLow128b
= cast
<ConstantInt
>(cB
->getAggregateElement(i
));
872 ConstantInt
* cHigh128b
= cast
<ConstantInt
>(cB
->getAggregateElement(i
+ numElms
));
874 // extract values from constant mask
875 char valLow128bLane
= (char)(cLow128b
->getSExtValue());
876 char valHigh128bLane
= (char)(cHigh128b
->getSExtValue());
878 Value
* insertValLow128b
;
879 Value
* insertValHigh128b
;
881 // if the mask value is negative, insert a '0' in the respective output position
882 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
883 insertValLow128b
= (valLow128bLane
< 0) ? C((char)0) : VEXTRACT(a
, C((valLow128bLane
& 0xF)));
884 insertValHigh128b
= (valHigh128bLane
< 0) ? C((char)0) : VEXTRACT(a
, C((valHigh128bLane
& 0xF) + numElms
));
886 vShuf
= VINSERT(vShuf
, insertValLow128b
, i
);
887 vShuf
= VINSERT(vShuf
, insertValHigh128b
, (i
+ numElms
));
894 //////////////////////////////////////////////////////////////////////////
895 /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32
896 /// bits)in LLVM IR. If not supported on the underlying platform, emulate it
897 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
898 /// lower 8 values are used.
899 Value
*Builder::PMOVSXBD(Value
* a
)
901 // VPMOVSXBD output type
902 Type
* v8x32Ty
= VectorType::get(mInt32Ty
, 8);
903 // Extract 8 values from 128bit lane and sign extend
904 return S_EXT(VSHUFFLE(a
, a
, C
<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty
);
907 //////////////////////////////////////////////////////////////////////////
908 /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32
909 /// bits)in LLVM IR. If not supported on the underlying platform, emulate it
910 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
911 Value
*Builder::PMOVSXWD(Value
* a
)
913 // VPMOVSXWD output type
914 Type
* v8x32Ty
= VectorType::get(mInt32Ty
, 8);
915 // Extract 8 values from 128bit lane and sign extend
916 return S_EXT(VSHUFFLE(a
, a
, C
<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty
);
919 //////////////////////////////////////////////////////////////////////////
920 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
921 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
922 /// platform, emulate it
923 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
924 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
925 Value
*Builder::PERMD(Value
* a
, Value
* idx
)
928 // use avx2 permute instruction if available
929 if(JM()->mArch
.AVX2())
931 res
= VPERMD(a
, idx
);
935 if (isa
<Constant
>(idx
))
937 res
= VSHUFFLE(a
, a
, idx
);
942 for (uint32_t l
= 0; l
< JM()->mVWidth
; ++l
)
944 Value
* pIndex
= VEXTRACT(idx
, C(l
));
945 Value
* pVal
= VEXTRACT(a
, pIndex
);
946 res
= VINSERT(res
, pVal
, C(l
));
953 //////////////////////////////////////////////////////////////////////////
954 /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
955 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
956 /// platform, emulate it
957 /// @param a - 256bit SIMD lane(8x32bit) of float values.
958 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
959 Value
*Builder::PERMPS(Value
* a
, Value
* idx
)
962 // use avx2 permute instruction if available
963 if (JM()->mArch
.AVX2())
965 // llvm 3.6.0 swapped the order of the args to vpermd
966 res
= VPERMPS(idx
, a
);
970 if (isa
<Constant
>(idx
))
972 res
= VSHUFFLE(a
, a
, idx
);
977 for (uint32_t l
= 0; l
< JM()->mVWidth
; ++l
)
979 Value
* pIndex
= VEXTRACT(idx
, C(l
));
980 Value
* pVal
= VEXTRACT(a
, pIndex
);
981 res
= VINSERT(res
, pVal
, C(l
));
989 //////////////////////////////////////////////////////////////////////////
990 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
991 /// in LLVM IR. If not supported on the underlying platform, emulate it
992 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
993 Value
*Builder::CVTPH2PS(Value
* a
, const llvm::Twine
& name
)
995 if (JM()->mArch
.F16C())
997 return VCVTPH2PS(a
, name
);
1001 FunctionType
* pFuncTy
= FunctionType::get(mFP32Ty
, mInt16Ty
);
1002 Function
* pCvtPh2Ps
= cast
<Function
>(JM()->mpCurrentModule
->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy
));
1004 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == nullptr)
1006 sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", (void *)&ConvertFloat16ToFloat32
);
1009 Value
* pResult
= UndefValue::get(mSimdFP32Ty
);
1010 for (uint32_t i
= 0; i
< mVWidth
; ++i
)
1012 Value
* pSrc
= VEXTRACT(a
, C(i
));
1013 Value
* pConv
= CALL(pCvtPh2Ps
, std::initializer_list
<Value
*>{pSrc
});
1014 pResult
= VINSERT(pResult
, pConv
, C(i
));
1017 pResult
->setName(name
);
1022 //////////////////////////////////////////////////////////////////////////
1023 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
1024 /// in LLVM IR. If not supported on the underlying platform, emulate it
1025 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
1026 Value
*Builder::CVTPS2PH(Value
* a
, Value
* rounding
)
1028 if (JM()->mArch
.F16C())
1030 return VCVTPS2PH(a
, rounding
);
1034 // call scalar C function for now
1035 FunctionType
* pFuncTy
= FunctionType::get(mInt16Ty
, mFP32Ty
);
1036 Function
* pCvtPs2Ph
= cast
<Function
>(JM()->mpCurrentModule
->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy
));
1038 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr)
1040 sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", (void *)&ConvertFloat32ToFloat16
);
1043 Value
* pResult
= UndefValue::get(mSimdInt16Ty
);
1044 for (uint32_t i
= 0; i
< mVWidth
; ++i
)
1046 Value
* pSrc
= VEXTRACT(a
, C(i
));
1047 Value
* pConv
= CALL(pCvtPs2Ph
, std::initializer_list
<Value
*>{pSrc
});
1048 pResult
= VINSERT(pResult
, pConv
, C(i
));
1055 Value
*Builder::PMAXSD(Value
* a
, Value
* b
)
1057 Value
* cmp
= ICMP_SGT(a
, b
);
1058 return SELECT(cmp
, a
, b
);
1061 Value
*Builder::PMINSD(Value
* a
, Value
* b
)
1063 Value
* cmp
= ICMP_SLT(a
, b
);
1064 return SELECT(cmp
, a
, b
);
1067 void Builder::Gather4(const SWR_FORMAT format
, Value
* pSrcBase
, Value
* byteOffsets
,
1068 Value
* mask
, Value
* vGatherComponents
[], bool bPackedOutput
)
1070 const SWR_FORMAT_INFO
&info
= GetFormatInfo(format
);
1071 if(info
.type
[0] == SWR_TYPE_FLOAT
&& info
.bpc
[0] == 32)
1073 GATHER4PS(info
, pSrcBase
, byteOffsets
, mask
, vGatherComponents
, bPackedOutput
);
1077 GATHER4DD(info
, pSrcBase
, byteOffsets
, mask
, vGatherComponents
, bPackedOutput
);
1081 void Builder::GATHER4PS(const SWR_FORMAT_INFO
&info
, Value
* pSrcBase
, Value
* byteOffsets
,
1082 Value
* vMask
, Value
* vGatherComponents
[], bool bPackedOutput
)
1084 switch(info
.bpp
/ info
.numComps
)
1088 Value
* vGatherResult
[2];
1090 // TODO: vGatherMaskedVal
1091 Value
* vGatherMaskedVal
= VIMMED1((float)0);
1093 // always have at least one component out of x or y to fetch
1095 vGatherResult
[0] = GATHERPS(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
);
1096 // e.g. result of first 8x32bit integer gather for 16bit components
1097 // 256i - 0 1 2 3 4 5 6 7
1098 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1101 // if we have at least one component out of x or y to fetch
1102 if(info
.numComps
> 2)
1104 // offset base to the next components(zw) in the vertex to gather
1105 pSrcBase
= GEP(pSrcBase
, C((char)4));
1107 vGatherResult
[1] = GATHERPS(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
);
1108 // e.g. result of second 8x32bit integer gather for 16bit components
1109 // 256i - 0 1 2 3 4 5 6 7
1110 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1115 vGatherResult
[1] = vGatherMaskedVal
;
1118 // Shuffle gathered components into place, each row is a component
1119 Shuffle16bpcGather4(info
, vGatherResult
, vGatherComponents
, bPackedOutput
);
1125 for (uint32_t i
= 0; i
< 4; ++i
)
1127 vGatherComponents
[i
] = VIMMED1(*(float*)&info
.defaults
[i
]);
1130 for(uint32_t i
= 0; i
< info
.numComps
; i
++)
1132 uint32_t swizzleIndex
= info
.swizzle
[i
];
1134 // Gather a SIMD of components
1135 vGatherComponents
[swizzleIndex
] = GATHERPS(vGatherComponents
[swizzleIndex
], pSrcBase
, byteOffsets
, vMask
);
1137 // offset base to the next component to gather
1138 pSrcBase
= GEP(pSrcBase
, C((char)4));
1143 SWR_INVALID("Invalid float format");
1148 void Builder::GATHER4DD(const SWR_FORMAT_INFO
&info
, Value
* pSrcBase
, Value
* byteOffsets
,
1149 Value
* vMask
, Value
* vGatherComponents
[], bool bPackedOutput
)
1151 switch (info
.bpp
/ info
.numComps
)
1155 Value
* vGatherMaskedVal
= VIMMED1((int32_t)0);
1156 Value
* vGatherResult
= GATHERDD(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
);
1157 // e.g. result of an 8x32bit integer gather for 8bit components
1158 // 256i - 0 1 2 3 4 5 6 7
1159 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1161 Shuffle8bpcGather4(info
, vGatherResult
, vGatherComponents
, bPackedOutput
);
1166 Value
* vGatherResult
[2];
1168 // TODO: vGatherMaskedVal
1169 Value
* vGatherMaskedVal
= VIMMED1((int32_t)0);
1171 // always have at least one component out of x or y to fetch
1173 vGatherResult
[0] = GATHERDD(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
);
1174 // e.g. result of first 8x32bit integer gather for 16bit components
1175 // 256i - 0 1 2 3 4 5 6 7
1176 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1179 // if we have at least one component out of x or y to fetch
1180 if(info
.numComps
> 2)
1182 // offset base to the next components(zw) in the vertex to gather
1183 pSrcBase
= GEP(pSrcBase
, C((char)4));
1185 vGatherResult
[1] = GATHERDD(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
);
1186 // e.g. result of second 8x32bit integer gather for 16bit components
1187 // 256i - 0 1 2 3 4 5 6 7
1188 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1193 vGatherResult
[1] = vGatherMaskedVal
;
1196 // Shuffle gathered components into place, each row is a component
1197 Shuffle16bpcGather4(info
, vGatherResult
, vGatherComponents
, bPackedOutput
);
1204 for (uint32_t i
= 0; i
< 4; ++i
)
1206 vGatherComponents
[i
] = VIMMED1((int)info
.defaults
[i
]);
1209 for(uint32_t i
= 0; i
< info
.numComps
; i
++)
1211 uint32_t swizzleIndex
= info
.swizzle
[i
];
1213 // Gather a SIMD of components
1214 vGatherComponents
[swizzleIndex
] = GATHERDD(vGatherComponents
[swizzleIndex
], pSrcBase
, byteOffsets
, vMask
);
1216 // offset base to the next component to gather
1217 pSrcBase
= GEP(pSrcBase
, C((char)4));
1222 SWR_INVALID("unsupported format");
1227 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO
&info
, Value
* vGatherInput
[2], Value
* vGatherOutput
[4], bool bPackedOutput
)
1230 Type
* vGatherTy
= VectorType::get(IntegerType::getInt32Ty(JM()->mContext
), mVWidth
);
1231 Type
* v32x8Ty
= VectorType::get(mInt8Ty
, mVWidth
* 4); // vwidth is units of 32 bits
1233 // input could either be float or int vector; do shuffle work in int
1234 vGatherInput
[0] = BITCAST(vGatherInput
[0], mSimdInt32Ty
);
1235 vGatherInput
[1] = BITCAST(vGatherInput
[1], mSimdInt32Ty
);
1239 Type
* v128bitTy
= VectorType::get(IntegerType::getIntNTy(JM()->mContext
, 128), mVWidth
/ 4); // vwidth is units of 32 bits
1242 Value
* vConstMask
= C
<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1243 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1244 Value
* vShufResult
= BITCAST(PSHUFB(BITCAST(vGatherInput
[0], v32x8Ty
), vConstMask
), vGatherTy
);
1245 // after pshufb: group components together in each 128bit lane
1246 // 256i - 0 1 2 3 4 5 6 7
1247 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1249 Value
* vi128XY
= BITCAST(PERMD(vShufResult
, C
<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy
);
1250 // after PERMD: move and pack xy components into each 128bit lane
1251 // 256i - 0 1 2 3 4 5 6 7
1252 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1254 // do the same for zw components
1255 Value
* vi128ZW
= nullptr;
1256 if(info
.numComps
> 2)
1258 Value
* vShufResult
= BITCAST(PSHUFB(BITCAST(vGatherInput
[1], v32x8Ty
), vConstMask
), vGatherTy
);
1259 vi128ZW
= BITCAST(PERMD(vShufResult
, C
<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy
);
1262 for(uint32_t i
= 0; i
< 4; i
++)
1264 uint32_t swizzleIndex
= info
.swizzle
[i
];
1265 // todo: fixed for packed
1266 Value
* vGatherMaskedVal
= VIMMED1((int32_t)(info
.defaults
[i
]));
1267 if(i
>= info
.numComps
)
1269 // set the default component val
1270 vGatherOutput
[swizzleIndex
] = vGatherMaskedVal
;
1274 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1275 uint32_t lane
= ((i
== 0) || (i
== 2)) ? 0 : 1;
1276 // if x or y, use vi128XY permute result, else use vi128ZW
1277 Value
* selectedPermute
= (i
< 2) ? vi128XY
: vi128ZW
;
1279 // extract packed component 128 bit lanes
1280 vGatherOutput
[swizzleIndex
] = VEXTRACT(selectedPermute
, C(lane
));
1286 // pshufb masks for each component
1287 Value
* vConstMask
[2];
1289 vConstMask
[0] = C
<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1290 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1293 vConstMask
[1] = C
<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1294 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1297 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1299 for (uint32_t i
= 0; i
< 4; ++i
)
1301 vGatherOutput
[i
] = VIMMED1((int32_t)info
.defaults
[i
]);
1304 for(uint32_t i
= 0; i
< info
.numComps
; i
++)
1306 uint32_t swizzleIndex
= info
.swizzle
[i
];
1308 // select correct constMask for x/z or y/w pshufb
1309 uint32_t selectedMask
= ((i
== 0) || (i
== 2)) ? 0 : 1;
1310 // if x or y, use vi128XY permute result, else use vi128ZW
1311 uint32_t selectedGather
= (i
< 2) ? 0 : 1;
1313 vGatherOutput
[swizzleIndex
] = BITCAST(PSHUFB(BITCAST(vGatherInput
[selectedGather
], v32x8Ty
), vConstMask
[selectedMask
]), vGatherTy
);
1314 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1315 // 256i - 0 1 2 3 4 5 6 7
1316 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1321 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO
&info
, Value
* vGatherInput
, Value
* vGatherOutput
[], bool bPackedOutput
)
1324 Type
* vGatherTy
= VectorType::get(IntegerType::getInt32Ty(JM()->mContext
), mVWidth
);
1325 Type
* v32x8Ty
= VectorType::get(mInt8Ty
, mVWidth
* 4 ); // vwidth is units of 32 bits
1329 Type
* v128Ty
= VectorType::get(IntegerType::getIntNTy(JM()->mContext
, 128), mVWidth
/ 4); // vwidth is units of 32 bits
1331 Value
* vConstMask
= C
<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1332 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1333 Value
* vShufResult
= BITCAST(PSHUFB(BITCAST(vGatherInput
, v32x8Ty
), vConstMask
), vGatherTy
);
1334 // after pshufb: group components together in each 128bit lane
1335 // 256i - 0 1 2 3 4 5 6 7
1336 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1338 Value
* vi128XY
= BITCAST(PERMD(vShufResult
, C
<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty
);
1339 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1340 // 256i - 0 1 2 3 4 5 6 7
1341 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1343 // do the same for zw components
1344 Value
* vi128ZW
= nullptr;
1345 if(info
.numComps
> 2)
1347 vi128ZW
= BITCAST(PERMD(vShufResult
, C
<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty
);
1350 // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
1351 for(uint32_t i
= 0; i
< 4; i
++)
1353 uint32_t swizzleIndex
= info
.swizzle
[i
];
1354 // todo: fix for packed
1355 Value
* vGatherMaskedVal
= VIMMED1((int32_t)(info
.defaults
[i
]));
1356 if(i
>= info
.numComps
)
1358 // set the default component val
1359 vGatherOutput
[swizzleIndex
] = vGatherMaskedVal
;
1363 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1364 uint32_t lane
= ((i
== 0) || (i
== 2)) ? 0 : 1;
1365 // if x or y, use vi128XY permute result, else use vi128ZW
1366 Value
* selectedPermute
= (i
< 2) ? vi128XY
: vi128ZW
;
1369 vGatherOutput
[swizzleIndex
] = VEXTRACT(selectedPermute
, C(lane
));
1374 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1376 for (uint32_t i
= 0; i
< 4; ++i
)
1378 vGatherOutput
[i
] = VIMMED1((int32_t)info
.defaults
[i
]);
1381 for(uint32_t i
= 0; i
< info
.numComps
; i
++){
1382 uint32_t swizzleIndex
= info
.swizzle
[i
];
1384 // pshufb masks for each component
1390 vConstMask
= C
<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1391 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1395 vConstMask
= C
<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1396 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1400 vConstMask
= C
<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1401 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1405 vConstMask
= C
<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1406 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1409 vConstMask
= nullptr;
1413 vGatherOutput
[swizzleIndex
] = BITCAST(PSHUFB(BITCAST(vGatherInput
, v32x8Ty
), vConstMask
), vGatherTy
);
1414 // after pshufb for x channel
1415 // 256i - 0 1 2 3 4 5 6 7
1416 // x000 x000 x000 x000 x000 x000 x000 x000
1421 // Helper function to create alloca in entry block of function
1422 Value
* Builder::CreateEntryAlloca(Function
* pFunc
, Type
* pType
)
1424 auto saveIP
= IRB()->saveIP();
1425 IRB()->SetInsertPoint(&pFunc
->getEntryBlock(),
1426 pFunc
->getEntryBlock().begin());
1427 Value
* pAlloca
= ALLOCA(pType
);
1428 if (saveIP
.isSet()) IRB()->restoreIP(saveIP
);
1432 Value
* Builder::CreateEntryAlloca(Function
* pFunc
, Type
* pType
, Value
* pArraySize
)
1434 auto saveIP
= IRB()->saveIP();
1435 IRB()->SetInsertPoint(&pFunc
->getEntryBlock(),
1436 pFunc
->getEntryBlock().begin());
1437 Value
* pAlloca
= ALLOCA(pType
, pArraySize
);
1438 if (saveIP
.isSet()) IRB()->restoreIP(saveIP
);
1442 //////////////////////////////////////////////////////////////////////////
1443 /// @brief emulates a scatter operation.
1444 /// @param pDst - pointer to destination
1445 /// @param vSrc - vector of src data to scatter
1446 /// @param vOffsets - vector of byte offsets from pDst
1447 /// @param vMask - mask of valid lanes
1448 void Builder::SCATTERPS(Value
* pDst
, Value
* vSrc
, Value
* vOffsets
, Value
* vMask
)
1450 /* Scatter algorithm
1452 while(Index = BitScanForward(mask))
1453 srcElem = srcVector[Index]
1454 offsetElem = offsetVector[Index]
1455 *(pDst + offsetElem) = srcElem
1456 Update mask (&= ~(1<<Index)
1460 BasicBlock
* pCurBB
= IRB()->GetInsertBlock();
1461 Function
* pFunc
= pCurBB
->getParent();
1462 Type
* pSrcTy
= vSrc
->getType()->getVectorElementType();
1464 // Store vectors on stack
1465 if (pScatterStackSrc
== nullptr)
1467 // Save off stack allocations and reuse per scatter. Significantly reduces stack
1468 // requirements for shaders with a lot of scatters.
1469 pScatterStackSrc
= CreateEntryAlloca(pFunc
, mSimdInt64Ty
);
1470 pScatterStackOffsets
= CreateEntryAlloca(pFunc
, mSimdInt32Ty
);
1473 Value
* pSrcArrayPtr
= BITCAST(pScatterStackSrc
, PointerType::get(vSrc
->getType(), 0));
1474 Value
* pOffsetsArrayPtr
= pScatterStackOffsets
;
1475 STORE(vSrc
, pSrcArrayPtr
);
1476 STORE(vOffsets
, pOffsetsArrayPtr
);
1478 // Cast to pointers for random access
1479 pSrcArrayPtr
= POINTER_CAST(pSrcArrayPtr
, PointerType::get(pSrcTy
, 0));
1480 pOffsetsArrayPtr
= POINTER_CAST(pOffsetsArrayPtr
, PointerType::get(mInt32Ty
, 0));
1482 Value
* pMask
= VMOVMSKPS(BITCAST(vMask
, mSimdFP32Ty
));
1484 // Get cttz function
1485 Function
* pfnCttz
= Intrinsic::getDeclaration(mpJitMgr
->mpCurrentModule
, Intrinsic::cttz
, { mInt32Ty
});
1487 // Setup loop basic block
1488 BasicBlock
* pLoop
= BasicBlock::Create(mpJitMgr
->mContext
, "Scatter_Loop", pFunc
);
1490 // compute first set bit
1491 Value
* pIndex
= CALL(pfnCttz
, { pMask
, C(false) });
1493 Value
* pIsUndef
= ICMP_EQ(pIndex
, C(32));
1495 // Split current block
1496 BasicBlock
* pPostLoop
= pCurBB
->splitBasicBlock(cast
<Instruction
>(pIsUndef
)->getNextNode());
1498 // Remove unconditional jump created by splitBasicBlock
1499 pCurBB
->getTerminator()->eraseFromParent();
1501 // Add terminator to end of original block
1502 IRB()->SetInsertPoint(pCurBB
);
1504 // Add conditional branch
1505 COND_BR(pIsUndef
, pPostLoop
, pLoop
);
1507 // Add loop basic block contents
1508 IRB()->SetInsertPoint(pLoop
);
1509 PHINode
* pIndexPhi
= PHI(mInt32Ty
, 2);
1510 PHINode
* pMaskPhi
= PHI(mInt32Ty
, 2);
1512 pIndexPhi
->addIncoming(pIndex
, pCurBB
);
1513 pMaskPhi
->addIncoming(pMask
, pCurBB
);
1515 // Extract elements for this index
1516 Value
* pSrcElem
= LOADV(pSrcArrayPtr
, { pIndexPhi
});
1517 Value
* pOffsetElem
= LOADV(pOffsetsArrayPtr
, { pIndexPhi
});
1519 // GEP to this offset in dst
1520 Value
* pCurDst
= GEP(pDst
, pOffsetElem
);
1521 pCurDst
= POINTER_CAST(pCurDst
, PointerType::get(pSrcTy
, 0));
1522 STORE(pSrcElem
, pCurDst
);
1525 Value
* pNewMask
= AND(pMaskPhi
, NOT(SHL(C(1), pIndexPhi
)));
1528 Value
* pNewIndex
= CALL(pfnCttz
, { pNewMask
, C(false) });
1530 pIsUndef
= ICMP_EQ(pNewIndex
, C(32));
1531 COND_BR(pIsUndef
, pPostLoop
, pLoop
);
1534 pIndexPhi
->addIncoming(pNewIndex
, pLoop
);
1535 pMaskPhi
->addIncoming(pNewMask
, pLoop
);
1537 // Move builder to beginning of post loop
1538 IRB()->SetInsertPoint(pPostLoop
, pPostLoop
->begin());
1541 Value
* Builder::VABSPS(Value
* a
)
1543 Value
* asInt
= BITCAST(a
, mSimdInt32Ty
);
1544 Value
* result
= BITCAST(AND(asInt
, VIMMED1(0x7fffffff)), mSimdFP32Ty
);
1548 Value
*Builder::ICLAMP(Value
* src
, Value
* low
, Value
* high
, const llvm::Twine
& name
)
1550 Value
*lowCmp
= ICMP_SLT(src
, low
);
1551 Value
*ret
= SELECT(lowCmp
, low
, src
);
1553 Value
*highCmp
= ICMP_SGT(ret
, high
);
1554 ret
= SELECT(highCmp
, high
, ret
, name
);
1559 Value
*Builder::FCLAMP(Value
* src
, Value
* low
, Value
* high
)
1561 Value
*lowCmp
= FCMP_OLT(src
, low
);
1562 Value
*ret
= SELECT(lowCmp
, low
, src
);
1564 Value
*highCmp
= FCMP_OGT(ret
, high
);
1565 ret
= SELECT(highCmp
, high
, ret
);
1570 Value
*Builder::FCLAMP(Value
* src
, float low
, float high
)
1572 Value
* result
= VMAXPS(src
, VIMMED1(low
));
1573 result
= VMINPS(result
, VIMMED1(high
));
1578 //////////////////////////////////////////////////////////////////////////
1579 /// @brief save/restore stack, providing ability to push/pop the stack and
1580 /// reduce overall stack requirements for temporary stack use
1581 Value
* Builder::STACKSAVE()
1583 Function
* pfnStackSave
= Intrinsic::getDeclaration(JM()->mpCurrentModule
, Intrinsic::stacksave
);
1584 return CALLA(pfnStackSave
);
1587 void Builder::STACKRESTORE(Value
* pSaved
)
1589 Function
* pfnStackRestore
= Intrinsic::getDeclaration(JM()->mpCurrentModule
, Intrinsic::stackrestore
);
1590 CALL(pfnStackRestore
, std::initializer_list
<Value
*>{pSaved
});
1593 Value
*Builder::FMADDPS(Value
* a
, Value
* b
, Value
* c
)
1596 // use FMADs if available
1597 if(JM()->mArch
.AVX2())
1599 vOut
= VFMADDPS(a
, b
, c
);
1603 vOut
= FADD(FMUL(a
, b
), c
);
1608 Value
* Builder::POPCNT(Value
* a
)
1610 Function
* pCtPop
= Intrinsic::getDeclaration(JM()->mpCurrentModule
, Intrinsic::ctpop
, { a
->getType() });
1611 return CALL(pCtPop
, std::initializer_list
<Value
*>{a
});
1614 //////////////////////////////////////////////////////////////////////////
1615 /// @brief C functions called by LLVM IR
1616 //////////////////////////////////////////////////////////////////////////
1618 //////////////////////////////////////////////////////////////////////////
1619 /// @brief called in JIT code, inserted by PRINT
1620 /// output to both stdout and visual studio debug console
1621 void __cdecl
CallPrint(const char* fmt
, ...)
1624 va_start(args
, fmt
);
1627 #if defined( _WIN32 )
1629 vsnprintf_s(strBuf
, _TRUNCATE
, fmt
, args
);
1630 OutputDebugStringA(strBuf
);
1636 Value
*Builder::VEXTRACTI128(Value
* a
, Constant
* imm8
)
1638 bool flag
= !imm8
->isZeroValue();
1639 SmallVector
<Constant
*,8> idx
;
1640 for (unsigned i
= 0; i
< mVWidth
/ 2; i
++) {
1641 idx
.push_back(C(flag
? i
+ mVWidth
/ 2 : i
));
1643 return VSHUFFLE(a
, VUNDEF_I(), ConstantVector::get(idx
));
1646 Value
*Builder::VINSERTI128(Value
* a
, Value
* b
, Constant
* imm8
)
1648 bool flag
= !imm8
->isZeroValue();
1649 SmallVector
<Constant
*,8> idx
;
1650 for (unsigned i
= 0; i
< mVWidth
; i
++) {
1651 idx
.push_back(C(i
));
1653 Value
*inter
= VSHUFFLE(b
, VUNDEF_I(), ConstantVector::get(idx
));
1655 SmallVector
<Constant
*,8> idx2
;
1656 for (unsigned i
= 0; i
< mVWidth
/ 2; i
++) {
1657 idx2
.push_back(C(flag
? i
: i
+ mVWidth
));
1659 for (unsigned i
= mVWidth
/ 2; i
< mVWidth
; i
++) {
1660 idx2
.push_back(C(flag
? i
+ mVWidth
/ 2 : i
));
1662 return VSHUFFLE(a
, inter
, ConstantVector::get(idx2
));
1665 // rdtsc buckets macros
1666 void Builder::RDTSC_START(Value
* pBucketMgr
, Value
* pId
)
1668 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1669 // buckets framework when single threaded
1670 if (KNOB_SINGLE_THREADED
)
1672 std::vector
<Type
*> args
{
1673 PointerType::get(mInt32Ty
, 0), // pBucketMgr
1677 FunctionType
* pFuncTy
= FunctionType::get(Type::getVoidTy(JM()->mContext
), args
, false);
1678 Function
* pFunc
= cast
<Function
>(JM()->mpCurrentModule
->getOrInsertFunction("BucketManager_StartBucket", pFuncTy
));
1679 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1681 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket
);
1684 CALL(pFunc
, { pBucketMgr
, pId
});
1688 void Builder::RDTSC_STOP(Value
* pBucketMgr
, Value
* pId
)
1690 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1691 // buckets framework when single threaded
1692 if (KNOB_SINGLE_THREADED
)
1694 std::vector
<Type
*> args
{
1695 PointerType::get(mInt32Ty
, 0), // pBucketMgr
1699 FunctionType
* pFuncTy
= FunctionType::get(Type::getVoidTy(JM()->mContext
), args
, false);
1700 Function
* pFunc
= cast
<Function
>(JM()->mpCurrentModule
->getOrInsertFunction("BucketManager_StopBucket", pFuncTy
));
1701 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1703 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket
);
1706 CALL(pFunc
, { pBucketMgr
, pId
});
1711 uint32_t Builder::GetTypeSize(Type
* pType
)
1713 if (pType
->isStructTy())
1715 uint32_t numElems
= pType
->getStructNumElements();
1716 Type
* pElemTy
= pType
->getStructElementType(0);
1717 return numElems
* GetTypeSize(pElemTy
);
1720 if (pType
->isArrayTy())
1722 uint32_t numElems
= pType
->getArrayNumElements();
1723 Type
* pElemTy
= pType
->getArrayElementType();
1724 return numElems
* GetTypeSize(pElemTy
);
1727 if (pType
->isIntegerTy())
1729 uint32_t bitSize
= pType
->getIntegerBitWidth();
1733 if (pType
->isFloatTy())
1738 if (pType
->isHalfTy())
1743 if (pType
->isDoubleTy())
1748 SWR_ASSERT(false, "Unimplemented type.");