/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder_misc.cpp
*
* @brief Implementation for miscellaneous builder functions
*
******************************************************************************/
31 #include "llvm/Support/DynamicLibrary.h"
33 void __cdecl
CallPrint(const char* fmt
, ...);
35 //////////////////////////////////////////////////////////////////////////
36 /// @brief Convert an IEEE 754 32-bit single precision float to an
37 /// 16 bit float with 5 exponent bits and a variable
38 /// number of mantissa bits.
39 /// @param val - 32-bit float
40 /// @todo Maybe move this outside of this file into a header?
//////////////////////////////////////////////////////////////////////////
/// @brief Convert an IEEE 754 32-bit single precision float to an
///        16 bit float with 5 exponent bits and a variable
///        number of mantissa bits.
/// @param val - 32-bit float
/// @return half-precision bit pattern (1 sign, 5 exponent, 10 mantissa bits)
/// @todo Maybe move this outside of this file into a header?
static uint16_t Convert32To16Float(float val)
{
    uint32_t sign, exp, mant;
    uint32_t roundBits;

    // Extract the sign, exponent, and mantissa
    uint32_t uf = *(uint32_t*)&val;
    sign = (uf & 0x80000000) >> 31;
    exp = (uf & 0x7F800000) >> 23;
    mant = uf & 0x007FFFFF;

    // Check for out of range
    if (std::isnan(val))
    {
        exp = 0x1F;
        mant = 0x200;
        sign = 1; // set the sign bit for NANs
    }
    else if (std::isinf(val))
    {
        exp = 0x1F;
        mant = 0x0;
    }
    else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
    {
        exp = 0x1E;
        mant = 0x3FF;
    }
    else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
    {
        // restore the implicit leading 1, then shift the mantissa right until
        // the exponent reaches the half-precision denormal range
        mant |= 0x00800000;
        for (; exp <= 0x70; mant >>= 1, exp++)
            ;
        exp = 0;
        mant = mant >> 13;
    }
    else if (exp < 0x66) // Too small to represent -> Zero
    {
        exp = 0;
        mant = 0;
    }
    else
    {
        // Saves bits that will be shifted off for rounding
        roundBits = mant & 0x1FFFu;
        // convert exponent and mantissa to 16 bit format
        exp = exp - 0x70;
        mant = mant >> 13;

        // Essentially RTZ, but round up if off by only 1 lsb
        if (roundBits == 0x1FFFu)
        {
            mant++;
            // check for overflow into the exponent
            if ((mant & 0xC00u) != 0)
            {
                exp++;
            }
            // make sure only the needed bits are used
            mant &= 0x3FF;
        }
    }

    uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
    return (uint16_t)tmpVal;
}
106 //////////////////////////////////////////////////////////////////////////
107 /// @brief Convert an IEEE 754 16-bit float to an 32-bit single precision
109 /// @param val - 16-bit float
110 /// @todo Maybe move this outside of this file into a header?
111 static float ConvertSmallFloatTo32(UINT val
)
114 if ((val
& 0x7fff) == 0)
116 result
= ((uint32_t)(val
& 0x8000)) << 16;
118 else if ((val
& 0x7c00) == 0x7c00)
120 result
= ((val
& 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
121 result
|= ((uint32_t)val
& 0x8000) << 16;
125 uint32_t sign
= (val
& 0x8000) << 16;
126 uint32_t mant
= (val
& 0x3ff) << 13;
127 uint32_t exp
= (val
>> 10) & 0x1f;
128 if ((exp
== 0) && (mant
!= 0)) // Adjust exponent and mantissa for denormals
131 while (mant
< (0x400 << 13))
136 mant
&= (0x3ff << 13);
138 exp
= ((exp
- 15 + 127) & 0xff) << 23;
139 result
= sign
| exp
| mant
;
142 return *(float*)&result
;
145 Constant
*Builder::C(bool i
)
147 return ConstantInt::get(IRB()->getInt1Ty(), (i
? 1 : 0));
150 Constant
*Builder::C(char i
)
152 return ConstantInt::get(IRB()->getInt8Ty(), i
);
155 Constant
*Builder::C(uint8_t i
)
157 return ConstantInt::get(IRB()->getInt8Ty(), i
);
160 Constant
*Builder::C(int i
)
162 return ConstantInt::get(IRB()->getInt32Ty(), i
);
165 Constant
*Builder::C(int64_t i
)
167 return ConstantInt::get(IRB()->getInt64Ty(), i
);
170 Constant
*Builder::C(uint16_t i
)
172 return ConstantInt::get(mInt16Ty
,i
);
175 Constant
*Builder::C(uint32_t i
)
177 return ConstantInt::get(IRB()->getInt32Ty(), i
);
180 Constant
*Builder::C(float i
)
182 return ConstantFP::get(IRB()->getFloatTy(), i
);
185 Constant
*Builder::PRED(bool pred
)
187 return ConstantInt::get(IRB()->getInt1Ty(), (pred
? 1 : 0));
190 Value
*Builder::VIMMED1(int i
)
192 return ConstantVector::getSplat(JM()->mVWidth
, cast
<ConstantInt
>(C(i
)));
195 Value
*Builder::VIMMED1(uint32_t i
)
197 return ConstantVector::getSplat(JM()->mVWidth
, cast
<ConstantInt
>(C(i
)));
200 Value
*Builder::VIMMED1(float i
)
202 return ConstantVector::getSplat(JM()->mVWidth
, cast
<ConstantFP
>(C(i
)));
205 Value
*Builder::VIMMED1(bool i
)
207 return ConstantVector::getSplat(JM()->mVWidth
, cast
<ConstantInt
>(C(i
)));
210 Value
*Builder::VUNDEF_IPTR()
212 return UndefValue::get(VectorType::get(PointerType::get(mInt32Ty
, 0),JM()->mVWidth
));
215 Value
*Builder::VUNDEF_I()
217 return UndefValue::get(VectorType::get(mInt32Ty
, JM()->mVWidth
));
220 Value
*Builder::VUNDEF(Type
*ty
, uint32_t size
)
222 return UndefValue::get(VectorType::get(ty
, size
));
225 Value
*Builder::VUNDEF_F()
227 return UndefValue::get(VectorType::get(mFP32Ty
, JM()->mVWidth
));
230 Value
*Builder::VUNDEF(Type
* t
)
232 return UndefValue::get(VectorType::get(t
, JM()->mVWidth
));
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
/// @brief Insert element by uint64_t index; llvm 3.6 lacks this overload,
///        so forward to the i64-constant variant.
Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
{
    return VINSERT(vec, val, C((int64_t)index));
}
#endif
242 Value
*Builder::VBROADCAST(Value
*src
)
244 // check if src is already a vector
245 if (src
->getType()->isVectorTy())
250 return VECTOR_SPLAT(JM()->mVWidth
, src
);
253 uint32_t Builder::IMMED(Value
* v
)
255 SWR_ASSERT(isa
<ConstantInt
>(v
));
256 ConstantInt
*pValConst
= cast
<ConstantInt
>(v
);
257 return pValConst
->getZExtValue();
260 Value
*Builder::GEP(Value
* ptr
, const std::initializer_list
<Value
*> &indexList
)
262 std::vector
<Value
*> indices
;
263 for (auto i
: indexList
)
264 indices
.push_back(i
);
265 return GEPA(ptr
, indices
);
268 Value
*Builder::GEP(Value
* ptr
, const std::initializer_list
<uint32_t> &indexList
)
270 std::vector
<Value
*> indices
;
271 for (auto i
: indexList
)
272 indices
.push_back(C(i
));
273 return GEPA(ptr
, indices
);
276 LoadInst
*Builder::LOAD(Value
*basePtr
, const std::initializer_list
<uint32_t> &indices
, const llvm::Twine
& name
)
278 std::vector
<Value
*> valIndices
;
279 for (auto i
: indices
)
280 valIndices
.push_back(C(i
));
281 return LOAD(GEPA(basePtr
, valIndices
), name
);
284 LoadInst
*Builder::LOADV(Value
*basePtr
, const std::initializer_list
<Value
*> &indices
, const llvm::Twine
& name
)
286 std::vector
<Value
*> valIndices
;
287 for (auto i
: indices
)
288 valIndices
.push_back(i
);
289 return LOAD(GEPA(basePtr
, valIndices
), name
);
292 StoreInst
*Builder::STORE(Value
*val
, Value
*basePtr
, const std::initializer_list
<uint32_t> &indices
)
294 std::vector
<Value
*> valIndices
;
295 for (auto i
: indices
)
296 valIndices
.push_back(C(i
));
297 return STORE(val
, GEPA(basePtr
, valIndices
));
300 StoreInst
*Builder::STOREV(Value
*val
, Value
*basePtr
, const std::initializer_list
<Value
*> &indices
)
302 std::vector
<Value
*> valIndices
;
303 for (auto i
: indices
)
304 valIndices
.push_back(i
);
305 return STORE(val
, GEPA(basePtr
, valIndices
));
308 CallInst
*Builder::CALL(Value
*Callee
, const std::initializer_list
<Value
*> &argsList
)
310 std::vector
<Value
*> args
;
311 for (auto arg
: argsList
)
313 return CALLA(Callee
, args
);
316 Value
*Builder::VRCP(Value
*va
)
318 return FDIV(VIMMED1(1.0f
), va
); // 1 / a
321 Value
*Builder::VPLANEPS(Value
* vA
, Value
* vB
, Value
* vC
, Value
* &vX
, Value
* &vY
)
323 Value
* vOut
= FMADDPS(vA
, vX
, vC
);
324 vOut
= FMADDPS(vB
, vY
, vOut
);
328 //////////////////////////////////////////////////////////////////////////
329 /// @brief Generate an i32 masked load operation in LLVM IR. If not
330 /// supported on the underlying platform, emulate it with float masked load
331 /// @param src - base address pointer for the load
332 /// @param vMask - SIMD wide mask that controls whether to access memory load 0
333 Value
*Builder::MASKLOADD(Value
* src
,Value
* mask
)
336 // use avx2 gather instruction is available
337 if(JM()->mArch
.AVX2())
339 Function
*func
= Intrinsic::getDeclaration(JM()->mpCurrentModule
, Intrinsic::x86_avx2_maskload_d_256
);
340 vResult
= CALL(func
,{src
,mask
});
344 Function
*func
= Intrinsic::getDeclaration(JM()->mpCurrentModule
,Intrinsic::x86_avx_maskload_ps_256
);
345 Value
* fMask
= BITCAST(mask
,VectorType::get(mFP32Ty
,JM()->mVWidth
));
346 vResult
= BITCAST(CALL(func
,{src
,fMask
}), VectorType::get(mInt32Ty
,JM()->mVWidth
));
351 //////////////////////////////////////////////////////////////////////////
352 /// @brief insert a JIT call to CallPrint
353 /// - outputs formatted string to both stdout and VS output window
354 /// - DEBUG builds only
356 /// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
357 /// where C(lane) creates a constant value to print, and pIndex is the Value*
358 /// result from a GEP, printing out the pointer to memory
359 /// @param printStr - constant string to print, which includes format specifiers
360 /// @param printArgs - initializer list of Value*'s to print to std out
361 CallInst
*Builder::PRINT(const std::string
&printStr
,const std::initializer_list
<Value
*> &printArgs
)
363 // push the arguments to CallPrint into a vector
364 std::vector
<Value
*> printCallArgs
;
365 // save room for the format string. we still need to modify it for vectors
366 printCallArgs
.resize(1);
368 // search through the format string for special processing
370 std::string
tempStr(printStr
);
371 pos
= tempStr
.find('%', pos
);
372 auto v
= printArgs
.begin();
374 while ((pos
!= std::string::npos
) && (v
!= printArgs
.end()))
377 Type
* pType
= pArg
->getType();
379 if (tempStr
[pos
+ 1] == 't')
381 if (pType
->isVectorTy())
383 Type
* pContainedType
= pType
->getContainedType(0);
385 std::string vectorFormatStr
;
387 if (pContainedType
->isFloatTy())
389 tempStr
[pos
+ 1] = 'f'; // Ensure its %f
390 printCallArgs
.push_back(FP_EXT(VEXTRACT(pArg
, C(0)), mDoubleTy
));
392 for (uint32_t i
= 1; i
< pType
->getVectorNumElements(); ++i
)
394 vectorFormatStr
+= "%f ";
395 printCallArgs
.push_back(FP_EXT(VEXTRACT(pArg
, C(i
)), mDoubleTy
));
398 else if (pContainedType
->isIntegerTy())
400 tempStr
[pos
+ 1] = 'd'; // Ensure its %d
401 printCallArgs
.push_back(VEXTRACT(pArg
, C(0)));
403 for (uint32_t i
= 1; i
< pType
->getVectorNumElements(); ++i
)
405 vectorFormatStr
+= "%d ";
406 printCallArgs
.push_back(VEXTRACT(pArg
, C(i
)));
411 SWR_ASSERT(0, "Unsupported tyep");
414 tempStr
.insert(pos
, vectorFormatStr
);
415 pos
+= vectorFormatStr
.size();
419 if (pType
->isFloatTy())
421 tempStr
[pos
+ 1] = 'f'; // Ensure its %f
422 printCallArgs
.push_back(FP_EXT(pArg
, mDoubleTy
));
424 else if (pType
->isIntegerTy())
426 tempStr
[pos
+ 1] = 'd'; // Ensure its %d
427 printCallArgs
.push_back(pArg
);
431 else if (toupper(tempStr
[pos
+ 1]) == 'X')
433 if (pType
->isVectorTy())
436 tempStr
.insert(pos
+ 1, "x%08");
438 printCallArgs
.push_back(VEXTRACT(pArg
, C(0)));
440 std::string vectorFormatStr
;
441 for (uint32_t i
= 1; i
< pType
->getVectorNumElements(); ++i
)
443 vectorFormatStr
+= "0x%08X ";
444 printCallArgs
.push_back(VEXTRACT(pArg
, C(i
)));
447 tempStr
.insert(pos
, vectorFormatStr
);
448 pos
+= vectorFormatStr
.size();
453 tempStr
.insert(pos
+ 1, "x%08");
454 printCallArgs
.push_back(pArg
);
458 // for %f we need to cast float Values to doubles so that they print out correctly
459 else if ((tempStr
[pos
+ 1] == 'f') && (pType
->isFloatTy()))
461 printCallArgs
.push_back(FP_EXT(pArg
, Type::getDoubleTy(JM()->mContext
)));
464 // add special handling for %f and %d format specifiers to make printing llvm vector types easier
465 else if (pType
->isVectorTy())
467 Type
* pContainedType
= pType
->getContainedType(0);
469 if ((tempStr
[pos
+ 1] == 'f') && (pContainedType
->isFloatTy()))
472 for (; i
< (pArg
->getType()->getVectorNumElements()) - 1; i
++)
474 tempStr
.insert(pos
, std::string("%f "));
476 printCallArgs
.push_back(FP_EXT(VEXTRACT(pArg
, C(i
)), Type::getDoubleTy(JM()->mContext
)));
478 printCallArgs
.push_back(FP_EXT(VEXTRACT(pArg
, C(i
)), Type::getDoubleTy(JM()->mContext
)));
480 else if ((tempStr
[pos
+ 1] == 'd') && (pContainedType
->isIntegerTy()))
483 for (; i
< (pArg
->getType()->getVectorNumElements()) - 1; i
++)
485 tempStr
.insert(pos
, std::string("%d "));
487 printCallArgs
.push_back(VEXTRACT(pArg
, C(i
)));
489 printCallArgs
.push_back(VEXTRACT(pArg
, C(i
)));
493 /// not a supported vector to print
494 /// @todo pointer types too
500 printCallArgs
.push_back(pArg
);
503 // advance to the next arguement
505 pos
= tempStr
.find('%', ++pos
);
508 // create global variable constant string
509 Constant
*constString
= ConstantDataArray::getString(JM()->mContext
,tempStr
,true);
510 GlobalVariable
*gvPtr
= new GlobalVariable(constString
->getType(),true,GlobalValue::InternalLinkage
,constString
,"printStr");
511 JM()->mpCurrentModule
->getGlobalList().push_back(gvPtr
);
513 // get a pointer to the first character in the constant string array
514 std::vector
<Constant
*> geplist
{C(0),C(0)};
515 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
516 Constant
*strGEP
= ConstantExpr::getGetElementPtr(gvPtr
,geplist
,false);
518 Constant
*strGEP
= ConstantExpr::getGetElementPtr(nullptr, gvPtr
,geplist
,false);
521 // insert the pointer to the format string in the argument vector
522 printCallArgs
[0] = strGEP
;
524 // get pointer to CallPrint function and insert decl into the module if needed
525 std::vector
<Type
*> args
;
526 args
.push_back(PointerType::get(mInt8Ty
,0));
527 FunctionType
* callPrintTy
= FunctionType::get(Type::getVoidTy(JM()->mContext
),args
,true);
528 Function
*callPrintFn
= cast
<Function
>(JM()->mpCurrentModule
->getOrInsertFunction("CallPrint", callPrintTy
));
530 // if we haven't yet added the symbol to the symbol table
531 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
533 sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint
);
536 // insert a call to CallPrint
537 return CALLA(callPrintFn
,printCallArgs
);
540 //////////////////////////////////////////////////////////////////////////
541 /// @brief Wrapper around PRINT with initializer list.
542 CallInst
* Builder::PRINT(const std::string
&printStr
)
544 return PRINT(printStr
, {});
547 //////////////////////////////////////////////////////////////////////////
548 /// @brief Generate a masked gather operation in LLVM IR. If not
549 /// supported on the underlying platform, emulate it with loads
550 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
551 /// @param pBase - Int8* base VB address pointer value
552 /// @param vIndices - SIMD wide value of VB byte offsets
553 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
554 /// @param scale - value to scale indices by
555 Value
*Builder::GATHERPS(Value
* vSrc
, Value
* pBase
, Value
* vIndices
, Value
* vMask
, Value
* scale
)
559 // use avx2 gather instruction if available
560 if(JM()->mArch
.AVX2())
562 // force mask to <N x float>, required by vgather
563 vMask
= BITCAST(vMask
, mSimdFP32Ty
);
564 vGather
= VGATHERPS(vSrc
,pBase
,vIndices
,vMask
,scale
);
568 Value
* pStack
= STACKSAVE();
570 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
571 Value
* vSrcPtr
= ALLOCA(vSrc
->getType());
572 STORE(vSrc
, vSrcPtr
);
574 vGather
= VUNDEF_F();
575 Value
*vScaleVec
= VBROADCAST(Z_EXT(scale
,mInt32Ty
));
576 Value
*vOffsets
= MUL(vIndices
,vScaleVec
);
577 Value
*mask
= MASK(vMask
);
578 for(uint32_t i
= 0; i
< JM()->mVWidth
; ++i
)
580 // single component byte index
581 Value
*offset
= VEXTRACT(vOffsets
,C(i
));
582 // byte pointer to component
583 Value
*loadAddress
= GEP(pBase
,offset
);
584 loadAddress
= BITCAST(loadAddress
,PointerType::get(mFP32Ty
,0));
585 // pointer to the value to load if we're masking off a component
586 Value
*maskLoadAddress
= GEP(vSrcPtr
,{C(0), C(i
)});
587 Value
*selMask
= VEXTRACT(mask
,C(i
));
588 // switch in a safe address to load if we're trying to access a vertex
589 Value
*validAddress
= SELECT(selMask
, loadAddress
, maskLoadAddress
);
590 Value
*val
= LOAD(validAddress
);
591 vGather
= VINSERT(vGather
,val
,C(i
));
593 STACKRESTORE(pStack
);
599 //////////////////////////////////////////////////////////////////////////
600 /// @brief Generate a masked gather operation in LLVM IR. If not
601 /// supported on the underlying platform, emulate it with loads
602 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
603 /// @param pBase - Int8* base VB address pointer value
604 /// @param vIndices - SIMD wide value of VB byte offsets
605 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
606 /// @param scale - value to scale indices by
607 Value
*Builder::GATHERDD(Value
* vSrc
, Value
* pBase
, Value
* vIndices
, Value
* vMask
, Value
* scale
)
611 // use avx2 gather instruction if available
612 if(JM()->mArch
.AVX2())
614 vGather
= VGATHERDD(vSrc
, pBase
, vIndices
, vMask
, scale
);
618 Value
* pStack
= STACKSAVE();
620 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
621 Value
* vSrcPtr
= ALLOCA(vSrc
->getType());
622 STORE(vSrc
, vSrcPtr
);
624 vGather
= VUNDEF_I();
625 Value
*vScaleVec
= VBROADCAST(Z_EXT(scale
, mInt32Ty
));
626 Value
*vOffsets
= MUL(vIndices
, vScaleVec
);
627 Value
*mask
= MASK(vMask
);
628 for(uint32_t i
= 0; i
< JM()->mVWidth
; ++i
)
630 // single component byte index
631 Value
*offset
= VEXTRACT(vOffsets
, C(i
));
632 // byte pointer to component
633 Value
*loadAddress
= GEP(pBase
, offset
);
634 loadAddress
= BITCAST(loadAddress
, PointerType::get(mInt32Ty
, 0));
635 // pointer to the value to load if we're masking off a component
636 Value
*maskLoadAddress
= GEP(vSrcPtr
, {C(0), C(i
)});
637 Value
*selMask
= VEXTRACT(mask
, C(i
));
638 // switch in a safe address to load if we're trying to access a vertex
639 Value
*validAddress
= SELECT(selMask
, loadAddress
, maskLoadAddress
);
640 Value
*val
= LOAD(validAddress
, C(0));
641 vGather
= VINSERT(vGather
, val
, C(i
));
644 STACKRESTORE(pStack
);
649 //////////////////////////////////////////////////////////////////////////
650 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
651 Value
* Builder::MASK(Value
* vmask
)
653 Value
* src
= BITCAST(vmask
, mSimdInt32Ty
);
654 return ICMP_SLT(src
, VIMMED1(0));
657 //////////////////////////////////////////////////////////////////////////
658 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
659 Value
* Builder::VMASK(Value
* mask
)
661 return S_EXT(mask
, mSimdInt32Ty
);
664 //////////////////////////////////////////////////////////////////////////
665 /// @brief Generate a VPSHUFB operation in LLVM IR. If not
666 /// supported on the underlying platform, emulate it
667 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
668 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
669 /// Byte masks in lower 128 lane of b selects 8 bit values from lower
670 /// 128bits of a, and vice versa for the upper lanes. If the mask
671 /// value is negative, '0' is inserted.
672 Value
*Builder::PSHUFB(Value
* a
, Value
* b
)
675 // use avx2 pshufb instruction if available
676 if(JM()->mArch
.AVX2())
682 Constant
* cB
= dyn_cast
<Constant
>(b
);
683 // number of 8 bit elements in b
684 uint32_t numElms
= cast
<VectorType
>(cB
->getType())->getNumElements();
686 Value
* vShuf
= UndefValue::get(VectorType::get(mInt8Ty
, numElms
));
688 // insert an 8 bit value from the high and low lanes of a per loop iteration
690 for(uint32_t i
= 0; i
< numElms
; i
++)
692 ConstantInt
* cLow128b
= cast
<ConstantInt
>(cB
->getAggregateElement(i
));
693 ConstantInt
* cHigh128b
= cast
<ConstantInt
>(cB
->getAggregateElement(i
+ numElms
));
695 // extract values from constant mask
696 char valLow128bLane
= (char)(cLow128b
->getSExtValue());
697 char valHigh128bLane
= (char)(cHigh128b
->getSExtValue());
699 Value
* insertValLow128b
;
700 Value
* insertValHigh128b
;
702 // if the mask value is negative, insert a '0' in the respective output position
703 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
704 insertValLow128b
= (valLow128bLane
< 0) ? C((char)0) : VEXTRACT(a
, C((valLow128bLane
& 0xF)));
705 insertValHigh128b
= (valHigh128bLane
< 0) ? C((char)0) : VEXTRACT(a
, C((valHigh128bLane
& 0xF) + numElms
));
707 vShuf
= VINSERT(vShuf
, insertValLow128b
, i
);
708 vShuf
= VINSERT(vShuf
, insertValHigh128b
, (i
+ numElms
));
715 //////////////////////////////////////////////////////////////////////////
716 /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32
717 /// bits)in LLVM IR. If not supported on the underlying platform, emulate it
718 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
719 /// lower 8 values are used.
720 Value
*Builder::PMOVSXBD(Value
* a
)
723 // use avx2 byte sign extend instruction if available
724 if(JM()->mArch
.AVX2())
730 // VPMOVSXBD output type
731 Type
* v8x32Ty
= VectorType::get(mInt32Ty
, 8);
732 // Extract 8 values from 128bit lane and sign extend
733 res
= S_EXT(VSHUFFLE(a
, a
, C
<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty
);
738 //////////////////////////////////////////////////////////////////////////
739 /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32
740 /// bits)in LLVM IR. If not supported on the underlying platform, emulate it
741 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
742 Value
*Builder::PMOVSXWD(Value
* a
)
745 // use avx2 word sign extend if available
746 if(JM()->mArch
.AVX2())
752 // VPMOVSXWD output type
753 Type
* v8x32Ty
= VectorType::get(mInt32Ty
, 8);
754 // Extract 8 values from 128bit lane and sign extend
755 res
= S_EXT(VSHUFFLE(a
, a
, C
<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty
);
760 //////////////////////////////////////////////////////////////////////////
761 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
762 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
763 /// platform, emulate it
764 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
765 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
766 Value
*Builder::PERMD(Value
* a
, Value
* idx
)
769 // use avx2 permute instruction if available
770 if(JM()->mArch
.AVX2())
772 // llvm 3.6.0 swapped the order of the args to vpermd
773 res
= VPERMD(idx
, a
);
777 res
= VSHUFFLE(a
, a
, idx
);
782 //////////////////////////////////////////////////////////////////////////
783 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
784 /// in LLVM IR. If not supported on the underlying platform, emulate it
785 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
786 Value
*Builder::CVTPH2PS(Value
* a
)
788 if (JM()->mArch
.F16C())
794 FunctionType
* pFuncTy
= FunctionType::get(mFP32Ty
, mInt16Ty
);
795 Function
* pCvtPh2Ps
= cast
<Function
>(JM()->mpCurrentModule
->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy
));
797 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
799 sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32
);
802 Value
* pResult
= UndefValue::get(mSimdFP32Ty
);
803 for (uint32_t i
= 0; i
< JM()->mVWidth
; ++i
)
805 Value
* pSrc
= VEXTRACT(a
, C(i
));
806 Value
* pConv
= CALL(pCvtPh2Ps
, std::initializer_list
<Value
*>{pSrc
});
807 pResult
= VINSERT(pResult
, pConv
, C(i
));
814 //////////////////////////////////////////////////////////////////////////
815 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
816 /// in LLVM IR. If not supported on the underlying platform, emulate it
817 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
818 Value
*Builder::CVTPS2PH(Value
* a
, Value
* rounding
)
820 if (JM()->mArch
.F16C())
822 return VCVTPS2PH(a
, rounding
);
826 // call scalar C function for now
827 FunctionType
* pFuncTy
= FunctionType::get(mInt16Ty
, mFP32Ty
);
828 Function
* pCvtPs2Ph
= cast
<Function
>(JM()->mpCurrentModule
->getOrInsertFunction("Convert32To16Float", pFuncTy
));
830 if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
832 sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float
);
835 Value
* pResult
= UndefValue::get(mSimdInt16Ty
);
836 for (uint32_t i
= 0; i
< JM()->mVWidth
; ++i
)
838 Value
* pSrc
= VEXTRACT(a
, C(i
));
839 Value
* pConv
= CALL(pCvtPs2Ph
, std::initializer_list
<Value
*>{pSrc
});
840 pResult
= VINSERT(pResult
, pConv
, C(i
));
847 Value
*Builder::PMAXSD(Value
* a
, Value
* b
)
849 if (JM()->mArch
.AVX2())
851 return VPMAXSD(a
, b
);
855 // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
856 Function
* pmaxsd
= Intrinsic::getDeclaration(JM()->mpCurrentModule
, Intrinsic::x86_sse41_pmaxsd
);
859 Value
* aLo
= VEXTRACTI128(a
, C((uint8_t)0));
860 Value
* bLo
= VEXTRACTI128(b
, C((uint8_t)0));
861 Value
* resLo
= CALL(pmaxsd
, {aLo
, bLo
});
864 Value
* aHi
= VEXTRACTI128(a
, C((uint8_t)1));
865 Value
* bHi
= VEXTRACTI128(b
, C((uint8_t)1));
866 Value
* resHi
= CALL(pmaxsd
, {aHi
, bHi
});
869 Value
* result
= VINSERTI128(VUNDEF_I(), resLo
, C((uint8_t)0));
870 result
= VINSERTI128(result
, resHi
, C((uint8_t)1));
876 Value
*Builder::PMINSD(Value
* a
, Value
* b
)
878 if (JM()->mArch
.AVX2())
880 return VPMINSD(a
, b
);
884 // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
885 Function
* pminsd
= Intrinsic::getDeclaration(JM()->mpCurrentModule
, Intrinsic::x86_sse41_pminsd
);
888 Value
* aLo
= VEXTRACTI128(a
, C((uint8_t)0));
889 Value
* bLo
= VEXTRACTI128(b
, C((uint8_t)0));
890 Value
* resLo
= CALL(pminsd
, {aLo
, bLo
});
893 Value
* aHi
= VEXTRACTI128(a
, C((uint8_t)1));
894 Value
* bHi
= VEXTRACTI128(b
, C((uint8_t)1));
895 Value
* resHi
= CALL(pminsd
, {aHi
, bHi
});
898 Value
* result
= VINSERTI128(VUNDEF_I(), resLo
, C((uint8_t)0));
899 result
= VINSERTI128(result
, resHi
, C((uint8_t)1));
905 void Builder::Gather4(const SWR_FORMAT format
, Value
* pSrcBase
, Value
* byteOffsets
,
906 Value
* mask
, Value
* vGatherComponents
[], bool bPackedOutput
)
908 const SWR_FORMAT_INFO
&info
= GetFormatInfo(format
);
909 if(info
.type
[0] == SWR_TYPE_FLOAT
&& info
.bpc
[0] == 32)
911 // ensure our mask is the correct type
912 mask
= BITCAST(mask
, mSimdFP32Ty
);
913 GATHER4PS(info
, pSrcBase
, byteOffsets
, mask
, vGatherComponents
, bPackedOutput
);
917 // ensure our mask is the correct type
918 mask
= BITCAST(mask
, mSimdInt32Ty
);
919 GATHER4DD(info
, pSrcBase
, byteOffsets
, mask
, vGatherComponents
, bPackedOutput
);
923 void Builder::GATHER4PS(const SWR_FORMAT_INFO
&info
, Value
* pSrcBase
, Value
* byteOffsets
,
924 Value
* mask
, Value
* vGatherComponents
[], bool bPackedOutput
)
926 switch(info
.bpp
/ info
.numComps
)
930 Value
* vGatherResult
[2];
933 // TODO: vGatherMaskedVal
934 Value
* vGatherMaskedVal
= VIMMED1((float)0);
936 // always have at least one component out of x or y to fetch
938 // save mask as it is zero'd out after each gather
941 vGatherResult
[0] = GATHERPS(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
, C((char)1));
942 // e.g. result of first 8x32bit integer gather for 16bit components
943 // 256i - 0 1 2 3 4 5 6 7
944 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
947 // if we have at least one component out of x or y to fetch
948 if(info
.numComps
> 2)
950 // offset base to the next components(zw) in the vertex to gather
951 pSrcBase
= GEP(pSrcBase
, C((char)4));
954 vGatherResult
[1] = GATHERPS(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
, C((char)1));
955 // e.g. result of second 8x32bit integer gather for 16bit components
956 // 256i - 0 1 2 3 4 5 6 7
957 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
962 vGatherResult
[1] = vGatherMaskedVal
;
965 // Shuffle gathered components into place, each row is a component
966 Shuffle16bpcGather4(info
, vGatherResult
, vGatherComponents
, bPackedOutput
);
972 for (uint32_t i
= 0; i
< 4; ++i
)
974 vGatherComponents
[i
] = VIMMED1(*(float*)&info
.defaults
[i
]);
977 for(uint32_t i
= 0; i
< info
.numComps
; i
++)
979 uint32_t swizzleIndex
= info
.swizzle
[i
];
981 // save mask as it is zero'd out after each gather
984 // Gather a SIMD of components
985 vGatherComponents
[swizzleIndex
] = GATHERPS(vGatherComponents
[swizzleIndex
], pSrcBase
, byteOffsets
, vMask
, C((char)1));
987 // offset base to the next component to gather
988 pSrcBase
= GEP(pSrcBase
, C((char)4));
993 SWR_ASSERT(0, "Invalid float format");
998 void Builder::GATHER4DD(const SWR_FORMAT_INFO
&info
, Value
* pSrcBase
, Value
* byteOffsets
,
999 Value
* mask
, Value
* vGatherComponents
[], bool bPackedOutput
)
1001 switch (info
.bpp
/ info
.numComps
)
1005 Value
* vGatherMaskedVal
= VIMMED1((int32_t)0);
1006 Value
* vGatherResult
= GATHERDD(vGatherMaskedVal
, pSrcBase
, byteOffsets
, mask
, C((char)1));
1007 // e.g. result of an 8x32bit integer gather for 8bit components
1008 // 256i - 0 1 2 3 4 5 6 7
1009 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1011 Shuffle8bpcGather4(info
, vGatherResult
, vGatherComponents
, bPackedOutput
);
1016 Value
* vGatherResult
[2];
1019 // TODO: vGatherMaskedVal
1020 Value
* vGatherMaskedVal
= VIMMED1((int32_t)0);
1022 // always have at least one component out of x or y to fetch
1024 // save mask as it is zero'd out after each gather
1027 vGatherResult
[0] = GATHERDD(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
, C((char)1));
1028 // e.g. result of first 8x32bit integer gather for 16bit components
1029 // 256i - 0 1 2 3 4 5 6 7
1030 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1033 // if we have at least one component out of x or y to fetch
1034 if(info
.numComps
> 2)
1036 // offset base to the next components(zw) in the vertex to gather
1037 pSrcBase
= GEP(pSrcBase
, C((char)4));
1040 vGatherResult
[1] = GATHERDD(vGatherMaskedVal
, pSrcBase
, byteOffsets
, vMask
, C((char)1));
1041 // e.g. result of second 8x32bit integer gather for 16bit components
1042 // 256i - 0 1 2 3 4 5 6 7
1043 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1048 vGatherResult
[1] = vGatherMaskedVal
;
1051 // Shuffle gathered components into place, each row is a component
1052 Shuffle16bpcGather4(info
, vGatherResult
, vGatherComponents
, bPackedOutput
);
1059 for (uint32_t i
= 0; i
< 4; ++i
)
1061 vGatherComponents
[i
] = VIMMED1((int)info
.defaults
[i
]);
1064 for(uint32_t i
= 0; i
< info
.numComps
; i
++)
1066 uint32_t swizzleIndex
= info
.swizzle
[i
];
1068 // save mask as it is zero'd out after each gather
1069 Value
*vMask
= mask
;
1071 // Gather a SIMD of components
1072 vGatherComponents
[swizzleIndex
] = GATHERDD(vGatherComponents
[swizzleIndex
], pSrcBase
, byteOffsets
, vMask
, C((char)1));
1074 // offset base to the next component to gather
1075 pSrcBase
= GEP(pSrcBase
, C((char)4));
1080 SWR_ASSERT(0, "unsupported format");
1085 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO
&info
, Value
* vGatherInput
[2], Value
* vGatherOutput
[4], bool bPackedOutput
)
1088 Type
* vGatherTy
= VectorType::get(IntegerType::getInt32Ty(JM()->mContext
), JM()->mVWidth
);
1089 Type
* v32x8Ty
= VectorType::get(mInt8Ty
, JM()->mVWidth
* 4); // vwidth is units of 32 bits
1091 // input could either be float or int vector; do shuffle work in int
1092 vGatherInput
[0] = BITCAST(vGatherInput
[0], mSimdInt32Ty
);
1093 vGatherInput
[1] = BITCAST(vGatherInput
[1], mSimdInt32Ty
);
1097 Type
* v128bitTy
= VectorType::get(IntegerType::getIntNTy(JM()->mContext
, 128), JM()->mVWidth
/ 4); // vwidth is units of 32 bits
1100 Value
* vConstMask
= C
<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1101 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1102 Value
* vShufResult
= BITCAST(PSHUFB(BITCAST(vGatherInput
[0], v32x8Ty
), vConstMask
), vGatherTy
);
1103 // after pshufb: group components together in each 128bit lane
1104 // 256i - 0 1 2 3 4 5 6 7
1105 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1107 Value
* vi128XY
= BITCAST(PERMD(vShufResult
, C
<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy
);
1108 // after PERMD: move and pack xy components into each 128bit lane
1109 // 256i - 0 1 2 3 4 5 6 7
1110 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1112 // do the same for zw components
1113 Value
* vi128ZW
= nullptr;
1114 if(info
.numComps
> 2)
1116 Value
* vShufResult
= BITCAST(PSHUFB(BITCAST(vGatherInput
[1], v32x8Ty
), vConstMask
), vGatherTy
);
1117 vi128ZW
= BITCAST(PERMD(vShufResult
, C
<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy
);
1120 for(uint32_t i
= 0; i
< 4; i
++)
1122 uint32_t swizzleIndex
= info
.swizzle
[i
];
1123 // todo: fixed for packed
1124 Value
* vGatherMaskedVal
= VIMMED1((int32_t)(info
.defaults
[i
]));
1125 if(i
>= info
.numComps
)
1127 // set the default component val
1128 vGatherOutput
[swizzleIndex
] = vGatherMaskedVal
;
1132 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1133 uint32_t lane
= ((i
== 0) || (i
== 2)) ? 0 : 1;
1134 // if x or y, use vi128XY permute result, else use vi128ZW
1135 Value
* selectedPermute
= (i
< 2) ? vi128XY
: vi128ZW
;
1137 // extract packed component 128 bit lanes
1138 vGatherOutput
[swizzleIndex
] = VEXTRACT(selectedPermute
, C(lane
));
1144 // pshufb masks for each component
1145 Value
* vConstMask
[2];
1147 vConstMask
[0] = C
<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1148 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1151 vConstMask
[1] = C
<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1152 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1155 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1157 for (uint32_t i
= 0; i
< 4; ++i
)
1159 vGatherOutput
[i
] = VIMMED1((int32_t)info
.defaults
[i
]);
1162 for(uint32_t i
= 0; i
< info
.numComps
; i
++)
1164 uint32_t swizzleIndex
= info
.swizzle
[i
];
1166 // select correct constMask for x/z or y/w pshufb
1167 uint32_t selectedMask
= ((i
== 0) || (i
== 2)) ? 0 : 1;
1168 // if x or y, use vi128XY permute result, else use vi128ZW
1169 uint32_t selectedGather
= (i
< 2) ? 0 : 1;
1171 vGatherOutput
[swizzleIndex
] = BITCAST(PSHUFB(BITCAST(vGatherInput
[selectedGather
], v32x8Ty
), vConstMask
[selectedMask
]), vGatherTy
);
1172 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1173 // 256i - 0 1 2 3 4 5 6 7
1174 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1179 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO
&info
, Value
* vGatherInput
, Value
* vGatherOutput
[], bool bPackedOutput
)
1182 Type
* vGatherTy
= VectorType::get(IntegerType::getInt32Ty(JM()->mContext
), JM()->mVWidth
);
1183 Type
* v32x8Ty
= VectorType::get(mInt8Ty
, JM()->mVWidth
* 4 ); // vwidth is units of 32 bits
1187 Type
* v128Ty
= VectorType::get(IntegerType::getIntNTy(JM()->mContext
, 128), JM()->mVWidth
/ 4); // vwidth is units of 32 bits
1189 Value
* vConstMask
= C
<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1190 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1191 Value
* vShufResult
= BITCAST(PSHUFB(BITCAST(vGatherInput
, v32x8Ty
), vConstMask
), vGatherTy
);
1192 // after pshufb: group components together in each 128bit lane
1193 // 256i - 0 1 2 3 4 5 6 7
1194 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1196 Value
* vi128XY
= BITCAST(PERMD(vShufResult
, C
<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty
);
1197 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1198 // 256i - 0 1 2 3 4 5 6 7
1199 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1201 // do the same for zw components
1202 Value
* vi128ZW
= nullptr;
1203 if(info
.numComps
> 2)
1205 vi128ZW
= BITCAST(PERMD(vShufResult
, C
<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty
);
1208 // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
1209 for(uint32_t i
= 0; i
< 4; i
++)
1211 uint32_t swizzleIndex
= info
.swizzle
[i
];
1212 // todo: fix for packed
1213 Value
* vGatherMaskedVal
= VIMMED1((int32_t)(info
.defaults
[i
]));
1214 if(i
>= info
.numComps
)
1216 // set the default component val
1217 vGatherOutput
[swizzleIndex
] = vGatherMaskedVal
;
1221 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1222 uint32_t lane
= ((i
== 0) || (i
== 2)) ? 0 : 1;
1223 // if x or y, use vi128XY permute result, else use vi128ZW
1224 Value
* selectedPermute
= (i
< 2) ? vi128XY
: vi128ZW
;
1227 vGatherOutput
[swizzleIndex
] = VEXTRACT(selectedPermute
, C(lane
));
1232 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1234 for (uint32_t i
= 0; i
< 4; ++i
)
1236 vGatherOutput
[i
] = VIMMED1((int32_t)info
.defaults
[i
]);
1239 for(uint32_t i
= 0; i
< info
.numComps
; i
++){
1240 uint32_t swizzleIndex
= info
.swizzle
[i
];
1242 // pshufb masks for each component
1248 vConstMask
= C
<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1249 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1253 vConstMask
= C
<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1254 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1258 vConstMask
= C
<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1259 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1263 vConstMask
= C
<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1264 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1267 vConstMask
= nullptr;
1271 vGatherOutput
[swizzleIndex
] = BITCAST(PSHUFB(BITCAST(vGatherInput
, v32x8Ty
), vConstMask
), vGatherTy
);
1272 // after pshufb for x channel
1273 // 256i - 0 1 2 3 4 5 6 7
1274 // x000 x000 x000 x000 x000 x000 x000 x000
1279 //////////////////////////////////////////////////////////////////////////
1280 /// @brief emulates a scatter operation.
1281 /// @param pDst - pointer to destination
1282 /// @param vSrc - vector of src data to scatter
1283 /// @param vOffsets - vector of byte offsets from pDst
1284 /// @param vMask - mask of valid lanes
1285 void Builder::SCATTERPS(Value
* pDst
, Value
* vSrc
, Value
* vOffsets
, Value
* vMask
)
1287 Value
* pStack
= STACKSAVE();
1289 Type
* pSrcTy
= vSrc
->getType()->getVectorElementType();
1291 // allocate tmp stack for masked off lanes
1292 Value
* vTmpPtr
= ALLOCA(pSrcTy
);
1294 Value
*mask
= MASK(vMask
);
1295 for (uint32_t i
= 0; i
< JM()->mVWidth
; ++i
)
1297 Value
*offset
= VEXTRACT(vOffsets
, C(i
));
1298 // byte pointer to component
1299 Value
*storeAddress
= GEP(pDst
, offset
);
1300 storeAddress
= BITCAST(storeAddress
, PointerType::get(pSrcTy
, 0));
1301 Value
*selMask
= VEXTRACT(mask
, C(i
));
1302 Value
*srcElem
= VEXTRACT(vSrc
, C(i
));
1303 // switch in a safe address to load if we're trying to access a vertex
1304 Value
*validAddress
= SELECT(selMask
, storeAddress
, vTmpPtr
);
1305 STORE(srcElem
, validAddress
);
1308 STACKRESTORE(pStack
);
1311 Value
* Builder::VABSPS(Value
* a
)
1313 Value
* asInt
= BITCAST(a
, mSimdInt32Ty
);
1314 Value
* result
= BITCAST(AND(asInt
, VIMMED1(0x7fffffff)), mSimdFP32Ty
);
1318 Value
*Builder::ICLAMP(Value
* src
, Value
* low
, Value
* high
)
1320 Value
*lowCmp
= ICMP_SLT(src
, low
);
1321 Value
*ret
= SELECT(lowCmp
, low
, src
);
1323 Value
*highCmp
= ICMP_SGT(ret
, high
);
1324 ret
= SELECT(highCmp
, high
, ret
);
1329 Value
*Builder::FCLAMP(Value
* src
, Value
* low
, Value
* high
)
1331 Value
*lowCmp
= FCMP_OLT(src
, low
);
1332 Value
*ret
= SELECT(lowCmp
, low
, src
);
1334 Value
*highCmp
= FCMP_OGT(ret
, high
);
1335 ret
= SELECT(highCmp
, high
, ret
);
1340 Value
*Builder::FCLAMP(Value
* src
, float low
, float high
)
1342 Value
* result
= VMAXPS(src
, VIMMED1(low
));
1343 result
= VMINPS(result
, VIMMED1(high
));
1348 //////////////////////////////////////////////////////////////////////////
1349 /// @brief save/restore stack, providing ability to push/pop the stack and
1350 /// reduce overall stack requirements for temporary stack use
1351 Value
* Builder::STACKSAVE()
1353 Function
* pfnStackSave
= Intrinsic::getDeclaration(JM()->mpCurrentModule
, Intrinsic::stacksave
);
1354 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
1355 return CALL(pfnStackSave
);
1357 return CALLA(pfnStackSave
);
1361 void Builder::STACKRESTORE(Value
* pSaved
)
1363 Function
* pfnStackRestore
= Intrinsic::getDeclaration(JM()->mpCurrentModule
, Intrinsic::stackrestore
);
1364 CALL(pfnStackRestore
, std::initializer_list
<Value
*>{pSaved
});
1367 Value
*Builder::FMADDPS(Value
* a
, Value
* b
, Value
* c
)
1370 // use FMADs if available
1371 if(JM()->mArch
.AVX2())
1373 vOut
= VFMADDPS(a
, b
, c
);
1377 vOut
= FADD(FMUL(a
, b
), c
);
1382 Value
* Builder::POPCNT(Value
* a
)
1384 Function
* pCtPop
= Intrinsic::getDeclaration(JM()->mpCurrentModule
, Intrinsic::ctpop
, { a
->getType() });
1385 return CALL(pCtPop
, std::initializer_list
<Value
*>{a
});
1388 //////////////////////////////////////////////////////////////////////////
1389 /// @brief C functions called by LLVM IR
1390 //////////////////////////////////////////////////////////////////////////
1392 //////////////////////////////////////////////////////////////////////////
1393 /// @brief called in JIT code, inserted by PRINT
1394 /// output to both stdout and visual studio debug console
1395 void __cdecl
CallPrint(const char* fmt
, ...)
1398 va_start(args
, fmt
);
1401 #if defined( _WIN32 )
1403 vsnprintf_s(strBuf
, _TRUNCATE
, fmt
, args
);
1404 OutputDebugString(strBuf
);
1408 Value
*Builder::VEXTRACTI128(Value
* a
, Constant
* imm8
)
1410 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
1412 Intrinsic::getDeclaration(JM()->mpCurrentModule
,
1413 Intrinsic::x86_avx_vextractf128_si_256
);
1414 return CALL(func
, {a
, imm8
});
1416 bool flag
= !imm8
->isZeroValue();
1417 SmallVector
<Constant
*,8> idx
;
1418 for (unsigned i
= 0; i
< JM()->mVWidth
/ 2; i
++) {
1419 idx
.push_back(C(flag
? i
+ JM()->mVWidth
/ 2 : i
));
1421 return VSHUFFLE(a
, VUNDEF_I(), ConstantVector::get(idx
));
1425 Value
*Builder::VINSERTI128(Value
* a
, Value
* b
, Constant
* imm8
)
1427 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
1429 Intrinsic::getDeclaration(JM()->mpCurrentModule
,
1430 Intrinsic::x86_avx_vinsertf128_si_256
);
1431 return CALL(func
, {a
, b
, imm8
});
1433 bool flag
= !imm8
->isZeroValue();
1434 SmallVector
<Constant
*,8> idx
;
1435 for (unsigned i
= 0; i
< JM()->mVWidth
; i
++) {
1436 idx
.push_back(C(i
));
1438 Value
*inter
= VSHUFFLE(b
, VUNDEF_I(), ConstantVector::get(idx
));
1440 SmallVector
<Constant
*,8> idx2
;
1441 for (unsigned i
= 0; i
< JM()->mVWidth
/ 2; i
++) {
1442 idx2
.push_back(C(flag
? i
: i
+ JM()->mVWidth
));
1444 for (unsigned i
= JM()->mVWidth
/ 2; i
< JM()->mVWidth
; i
++) {
1445 idx2
.push_back(C(flag
? i
+ JM()->mVWidth
/ 2 : i
));
1447 return VSHUFFLE(a
, inter
, ConstantVector::get(idx2
));