swr: [rasterizer] add support for llvm-3.9
src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "common/rdtsc_buckets.h"
32
33 void __cdecl CallPrint(const char* fmt, ...);
34
35 //////////////////////////////////////////////////////////////////////////
36 /// @brief Convert an IEEE 754 32-bit single precision float to a
37 /// 16 bit float with 5 exponent bits and a variable
38 /// number of mantissa bits.
39 /// @param val - 32-bit float
40 /// @todo Maybe move this outside of this file into a header?
41 static uint16_t Convert32To16Float(float val)
42 {
43 uint32_t sign, exp, mant;
44 uint32_t roundBits;
45
46 // Extract the sign, exponent, and mantissa
47 uint32_t uf = *(uint32_t*)&val;
48 sign = (uf & 0x80000000) >> 31;
49 exp = (uf & 0x7F800000) >> 23;
50 mant = uf & 0x007FFFFF;
51
52 // Check for out of range
53 if (std::isnan(val))
54 {
55 exp = 0x1F;
56 mant = 0x200;
57 sign = 1; // set the sign bit for NANs
58 }
59 else if (std::isinf(val))
60 {
61 exp = 0x1f;
62 mant = 0x0;
63 }
64 else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
65 {
66 exp = 0x1E;
67 mant = 0x3FF;
68 }
69 else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
70 {
71 mant |= 0x00800000;
72 for (; exp <= 0x70; mant >>= 1, exp++)
73 ;
74 exp = 0;
75 mant = mant >> 13;
76 }
77 else if (exp < 0x66) // Too small to represent -> Zero
78 {
79 exp = 0;
80 mant = 0;
81 }
82 else
83 {
84 // Saves bits that will be shifted off for rounding
85 roundBits = mant & 0x1FFFu;
86 // convert exponent and mantissa to 16 bit format
87 exp = exp - 0x70;
88 mant = mant >> 13;
89
90 // Essentially RTZ, but round up if off by only 1 lsb
91 if (roundBits == 0x1FFFu)
92 {
93 mant++;
94 // check for overflow
95 if ((mant & 0xC00u) != 0)
96 exp++;
97 // make sure only the needed bits are used
98 mant &= 0x3FF;
99 }
100 }
101
102 uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
103 return (uint16_t)tmpVal;
104 }
105
106 //////////////////////////////////////////////////////////////////////////
107 /// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
108 /// float
109 /// @param val - 16-bit float
110 /// @todo Maybe move this outside of this file into a header?
111 static float ConvertSmallFloatTo32(UINT val)
112 {
113 UINT result;
114 if ((val & 0x7fff) == 0)
115 {
116 result = ((uint32_t)(val & 0x8000)) << 16;
117 }
118 else if ((val & 0x7c00) == 0x7c00)
119 {
120 result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
121 result |= ((uint32_t)val & 0x8000) << 16;
122 }
123 else
124 {
125 uint32_t sign = (val & 0x8000) << 16;
126 uint32_t mant = (val & 0x3ff) << 13;
127 uint32_t exp = (val >> 10) & 0x1f;
128 if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
129 {
130 mant <<= 1;
131 while (mant < (0x400 << 13))
132 {
133 exp--;
134 mant <<= 1;
135 }
136 mant &= (0x3ff << 13);
137 }
138 exp = ((exp - 15 + 127) & 0xff) << 23;
139 result = sign | exp | mant;
140 }
141
142 return *(float*)&result;
143 }
144
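//////////////////////////////////////////////////////////////////////////
/// @brief Overloads that wrap scalar immediates as LLVM Constant values
/// of the matching integer or float type.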
145 Constant *Builder::C(bool i)
146 {
147 return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
148 }
149
150 Constant *Builder::C(char i)
151 {
152 return ConstantInt::get(IRB()->getInt8Ty(), i);
153 }
154
155 Constant *Builder::C(uint8_t i)
156 {
157 return ConstantInt::get(IRB()->getInt8Ty(), i);
158 }
159
160 Constant *Builder::C(int i)
161 {
162 return ConstantInt::get(IRB()->getInt32Ty(), i);
163 }
164
165 Constant *Builder::C(int64_t i)
166 {
167 return ConstantInt::get(IRB()->getInt64Ty(), i);
168 }
169
170 Constant *Builder::C(uint16_t i)
171 {
172 return ConstantInt::get(mInt16Ty,i);
173 }
174
175 Constant *Builder::C(uint32_t i)
176 {
177 return ConstantInt::get(IRB()->getInt32Ty(), i);
178 }
179
180 Constant *Builder::C(float i)
181 {
182 return ConstantFP::get(IRB()->getFloatTy(), i);
183 }
184
185 Constant *Builder::PRED(bool pred)
186 {
187 return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
188 }
189
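//////////////////////////////////////////////////////////////////////////
/// @brief Splat a scalar immediate across all SIMD lanes as a constant
/// vector of width mVWidth.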
190 Value *Builder::VIMMED1(int i)
191 {
192 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
193 }
194
195 Value *Builder::VIMMED1(uint32_t i)
196 {
197 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
198 }
199
200 Value *Builder::VIMMED1(float i)
201 {
202 return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
203 }
204
205 Value *Builder::VIMMED1(bool i)
206 {
207 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
208 }
209
210 Value *Builder::VUNDEF_IPTR()
211 {
212 return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
213 }
214
215 Value *Builder::VUNDEF_I()
216 {
217 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
218 }
219
220 Value *Builder::VUNDEF(Type *ty, uint32_t size)
221 {
222 return UndefValue::get(VectorType::get(ty, size));
223 }
224
225 Value *Builder::VUNDEF_F()
226 {
227 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
228 }
229
230 Value *Builder::VUNDEF(Type* t)
231 {
232 return UndefValue::get(VectorType::get(t, mVWidth));
233 }
234
235 #if HAVE_LLVM == 0x306
236 Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
237 {
238 return VINSERT(vec, val, C((int64_t)index));
239 }
240 #endif
241
242 Value *Builder::VBROADCAST(Value *src)
243 {
244 // check if src is already a vector
245 if (src->getType()->isVectorTy())
246 {
247 return src;
248 }
249
250 return VECTOR_SPLAT(mVWidth, src);
251 }
252
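//////////////////////////////////////////////////////////////////////////
/// @brief Extract the scalar value from a constant integer Value;
/// IMMED zero extends, S_IMMED sign extends. Both assert if the
/// value is not a ConstantInt.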
253 uint32_t Builder::IMMED(Value* v)
254 {
255 SWR_ASSERT(isa<ConstantInt>(v));
256 ConstantInt *pValConst = cast<ConstantInt>(v);
257 return pValConst->getZExtValue();
258 }
259
260 int32_t Builder::S_IMMED(Value* v)
261 {
262 SWR_ASSERT(isa<ConstantInt>(v));
263 ConstantInt *pValConst = cast<ConstantInt>(v);
264 return pValConst->getSExtValue();
265 }
266
267 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
268 {
269 std::vector<Value*> indices;
270 for (auto i : indexList)
271 indices.push_back(i);
272 return GEPA(ptr, indices);
273 }
274
275 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
276 {
277 std::vector<Value*> indices;
278 for (auto i : indexList)
279 indices.push_back(C(i));
280 return GEPA(ptr, indices);
281 }
282
283 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
284 {
285 std::vector<Value*> valIndices;
286 for (auto i : indices)
287 valIndices.push_back(C(i));
288 return LOAD(GEPA(basePtr, valIndices), name);
289 }
290
291 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
292 {
293 std::vector<Value*> valIndices;
294 for (auto i : indices)
295 valIndices.push_back(i);
296 return LOAD(GEPA(basePtr, valIndices), name);
297 }
298
299 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
300 {
301 std::vector<Value*> valIndices;
302 for (auto i : indices)
303 valIndices.push_back(C(i));
304 return STORE(val, GEPA(basePtr, valIndices));
305 }
306
307 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
308 {
309 std::vector<Value*> valIndices;
310 for (auto i : indices)
311 valIndices.push_back(i);
312 return STORE(val, GEPA(basePtr, valIndices));
313 }
314
315 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
316 {
317 std::vector<Value*> args;
318 for (auto arg : argsList)
319 args.push_back(arg);
320 return CALLA(Callee, args);
321 }
322
323 #if HAVE_LLVM > 0x306
324 CallInst *Builder::CALL(Value *Callee, Value* arg)
325 {
326 std::vector<Value*> args;
327 args.push_back(arg);
328 return CALLA(Callee, args);
329 }
330
331 CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
332 {
333 std::vector<Value*> args;
334 args.push_back(arg1);
335 args.push_back(arg2);
336 return CALLA(Callee, args);
337 }
338
339 CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
340 {
341 std::vector<Value*> args;
342 args.push_back(arg1);
343 args.push_back(arg2);
344 args.push_back(arg3);
345 return CALLA(Callee, args);
346 }
347 #endif
348
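//////////////////////////////////////////////////////////////////////////
/// @brief Per-lane reciprocal, computed as 1.0f / a.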
349 Value *Builder::VRCP(Value *va)
350 {
351 return FDIV(VIMMED1(1.0f), va); // 1 / a
352 }
353
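//////////////////////////////////////////////////////////////////////////
/// @brief Evaluate the plane equation vA * vX + vB * vY + vC per lane,
/// using fused multiply-adds where available (see FMADDPS).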
354 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
355 {
356 Value* vOut = FMADDPS(vA, vX, vC);
357 vOut = FMADDPS(vB, vY, vOut);
358 return vOut;
359 }
360
361 //////////////////////////////////////////////////////////////////////////
362 /// @brief Generate an i32 masked load operation in LLVM IR. If not
363 /// supported on the underlying platform, emulate it with float masked load
364 /// @param src - base address pointer for the load
365 /// @param vMask - SIMD wide mask that controls whether to access memory or load 0
366 Value *Builder::MASKLOADD(Value* src,Value* mask)
367 {
368 Value* vResult;
369 // use avx2 masked load instruction if available
370 if(JM()->mArch.AVX2())
371 {
372 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
373 vResult = CALL(func,{src,mask});
374 }
375 else
376 {
377 // maskload intrinsic expects integer mask operand in llvm >= 3.8
378 #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
379 mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
380 #else
381 mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
382 #endif
383 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
384 vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
385 }
386 return vResult;
387 }
388
389 //////////////////////////////////////////////////////////////////////////
390 /// @brief insert a JIT call to CallPrint
391 /// - outputs formatted string to both stdout and VS output window
392 /// - DEBUG builds only
393 /// Usage example:
394 /// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
395 /// where C(lane) creates a constant value to print, and pIndex is the Value*
396 /// result from a GEP, printing out the pointer to memory
397 /// @param printStr - constant string to print, which includes format specifiers
398 /// @param printArgs - initializer list of Value*'s to print to std out
399 CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
400 {
401 // push the arguments to CallPrint into a vector
402 std::vector<Value*> printCallArgs;
403 // save room for the format string. we still need to modify it for vectors
404 printCallArgs.resize(1);
405
406 // search through the format string for special processing
407 size_t pos = 0;
408 std::string tempStr(printStr);
409 pos = tempStr.find('%', pos);
410 auto v = printArgs.begin();
411
412 while ((pos != std::string::npos) && (v != printArgs.end()))
413 {
414 Value* pArg = *v;
415 Type* pType = pArg->getType();
416
417 if (pType->isVectorTy())
418 {
419 Type* pContainedType = pType->getContainedType(0);
420
421 if (toupper(tempStr[pos + 1]) == 'X')
422 {
423 tempStr[pos] = '0';
424 tempStr[pos + 1] = 'x';
425 tempStr.insert(pos + 2, "%08X ");
426 pos += 7;
427
428 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
429
430 std::string vectorFormatStr;
431 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
432 {
433 vectorFormatStr += "0x%08X ";
434 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
435 }
436
437 tempStr.insert(pos, vectorFormatStr);
438 pos += vectorFormatStr.size();
439 }
440 else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
441 {
442 uint32_t i = 0;
443 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
444 {
445 tempStr.insert(pos, std::string("%f "));
446 pos += 3;
447 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
448 }
449 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
450 }
451 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
452 {
453 uint32_t i = 0;
454 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
455 {
456 tempStr.insert(pos, std::string("%d "));
457 pos += 3;
458 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
459 }
460 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
461 }
462 }
463 else
464 {
465 if (toupper(tempStr[pos + 1]) == 'X')
466 {
467 tempStr[pos] = '0';
468 tempStr.insert(pos + 1, "x%08");
469 printCallArgs.push_back(pArg);
470 pos += 3;
471 }
472 // for %f we need to cast float Values to doubles so that they print out correctly
473 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
474 {
475 printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
476 pos++;
477 }
478 else
479 {
480 printCallArgs.push_back(pArg);
481 }
482 }
483
484 // advance to the next argument
485 v++;
486 pos = tempStr.find('%', ++pos);
487 }
488
489 // create global variable constant string
490 Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
491 GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
492 JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
493
494 // get a pointer to the first character in the constant string array
495 std::vector<Constant*> geplist{C(0),C(0)};
496 #if HAVE_LLVM == 0x306
497 Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
498 #else
499 Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
500 #endif
501
502 // insert the pointer to the format string in the argument vector
503 printCallArgs[0] = strGEP;
504
505 // get pointer to CallPrint function and insert decl into the module if needed
506 std::vector<Type*> args;
507 args.push_back(PointerType::get(mInt8Ty,0));
508 FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
509 Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
510
511 // if we haven't yet added the symbol to the symbol table
512 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
513 {
514 sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
515 }
516
517 // insert a call to CallPrint
518 return CALLA(callPrintFn,printCallArgs);
519 }
520
521 //////////////////////////////////////////////////////////////////////////
522 /// @brief Wrapper around PRINT with initializer list.
523 CallInst* Builder::PRINT(const std::string &printStr)
524 {
525 return PRINT(printStr, {});
526 }
527
528 //////////////////////////////////////////////////////////////////////////
529 /// @brief Generate a masked gather operation in LLVM IR. If not
530 /// supported on the underlying platform, emulate it with loads
531 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
532 /// @param pBase - Int8* base VB address pointer value
533 /// @param vIndices - SIMD wide value of VB byte offsets
534 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
535 /// @param scale - value to scale indices by
536 Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
537 {
538 Value* vGather;
539
540 // use avx2 gather instruction if available
541 if(JM()->mArch.AVX2())
542 {
543 // force mask to <N x float>, required by vgather
544 vMask = BITCAST(vMask, mSimdFP32Ty);
545 vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
546 }
547 else
548 {
549 Value* pStack = STACKSAVE();
550
551 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
552 Value* vSrcPtr = ALLOCA(vSrc->getType());
553 STORE(vSrc, vSrcPtr);
554
555 vGather = VUNDEF_F();
556 Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
557 Value *vOffsets = MUL(vIndices,vScaleVec);
558 Value *mask = MASK(vMask);
559 for(uint32_t i = 0; i < mVWidth; ++i)
560 {
561 // single component byte index
562 Value *offset = VEXTRACT(vOffsets,C(i));
563 // byte pointer to component
564 Value *loadAddress = GEP(pBase,offset);
565 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
566 // pointer to the value to load if we're masking off a component
567 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
568 Value *selMask = VEXTRACT(mask,C(i));
569 // if this lane is masked off, load from the stack copy of vSrc instead of memory
570 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
571 Value *val = LOAD(validAddress);
572 vGather = VINSERT(vGather,val,C(i));
573 }
574 STACKRESTORE(pStack);
575 }
576
577 return vGather;
578 }
579
580 //////////////////////////////////////////////////////////////////////////
581 /// @brief Generate a masked gather operation in LLVM IR. If not
582 /// supported on the underlying platform, emulate it with loads
583 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
584 /// @param pBase - Int8* base VB address pointer value
585 /// @param vIndices - SIMD wide value of VB byte offsets
586 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
587 /// @param scale - value to scale indices by
588 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
589 {
590 Value* vGather;
591
592 // use avx2 gather instruction if available
593 if(JM()->mArch.AVX2())
594 {
595 vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
596 }
597 else
598 {
599 Value* pStack = STACKSAVE();
600
601 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
602 Value* vSrcPtr = ALLOCA(vSrc->getType());
603 STORE(vSrc, vSrcPtr);
604
605 vGather = VUNDEF_I();
606 Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
607 Value *vOffsets = MUL(vIndices, vScaleVec);
608 Value *mask = MASK(vMask);
609 for(uint32_t i = 0; i < mVWidth; ++i)
610 {
611 // single component byte index
612 Value *offset = VEXTRACT(vOffsets, C(i));
613 // byte pointer to component
614 Value *loadAddress = GEP(pBase, offset);
615 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
616 // pointer to the value to load if we're masking off a component
617 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
618 Value *selMask = VEXTRACT(mask, C(i));
619 // if this lane is masked off, load from the stack copy of vSrc instead of memory
620 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
621 Value *val = LOAD(validAddress, C(0));
622 vGather = VINSERT(vGather, val, C(i));
623 }
624
625 STACKRESTORE(pStack);
626 }
627 return vGather;
628 }
629
630 //////////////////////////////////////////////////////////////////////////
631 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
632 Value* Builder::MASK(Value* vmask)
633 {
634 Value* src = BITCAST(vmask, mSimdInt32Ty);
635 return ICMP_SLT(src, VIMMED1(0));
636 }
637
638 //////////////////////////////////////////////////////////////////////////
639 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
640 Value* Builder::VMASK(Value* mask)
641 {
642 return S_EXT(mask, mSimdInt32Ty);
643 }
644
645 //////////////////////////////////////////////////////////////////////////
646 /// @brief Generate a VPSHUFB operation in LLVM IR. If not
647 /// supported on the underlying platform, emulate it
648 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
649 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
650 /// Byte masks in the lower 128-bit lane of b select 8-bit values from the
651 /// lower 128 bits of a, and vice versa for the upper lanes. If the mask
652 /// value is negative, '0' is inserted.
653 Value *Builder::PSHUFB(Value* a, Value* b)
654 {
655 Value* res;
656 // use avx2 pshufb instruction if available
657 if(JM()->mArch.AVX2())
658 {
659 res = VPSHUFB(a, b);
660 }
661 else
662 {
663 Constant* cB = dyn_cast<Constant>(b);
664 // number of 8 bit elements in b
665 uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
666 // output vector
667 Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
668
669 // insert an 8 bit value from the high and low lanes of a per loop iteration
670 numElms /= 2;
671 for(uint32_t i = 0; i < numElms; i++)
672 {
673 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
674 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
675
676 // extract values from constant mask
677 char valLow128bLane = (char)(cLow128b->getSExtValue());
678 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
679
680 Value* insertValLow128b;
681 Value* insertValHigh128b;
682
683 // if the mask value is negative, insert a '0' in the respective output position
684 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
685 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
686 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
687
688 vShuf = VINSERT(vShuf, insertValLow128b, i);
689 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
690 }
691 res = vShuf;
692 }
693 return res;
694 }
695
696 //////////////////////////////////////////////////////////////////////////
697 /// @brief Generate a VPMOVSXBD operation (sign extend 8 8-bit values to 32
698 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
699 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
700 /// lower 8 values are used.
701 Value *Builder::PMOVSXBD(Value* a)
702 {
703 // llvm-3.9 removed the pmovsxbd intrinsic
704 #if HAVE_LLVM < 0x309
705 // use avx2 byte sign extend instruction if available
706 if(JM()->mArch.AVX2())
707 {
708 Function *pmovsxbd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd);
709 return CALL(pmovsxbd, std::initializer_list<Value*>{a});
710 }
711 else
712 #endif
713 {
714 // VPMOVSXBD output type
715 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
716 // Extract 8 values from 128bit lane and sign extend
717 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
718 }
719 }
720
721 //////////////////////////////////////////////////////////////////////////
722 /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32
723 /// bits)in LLVM IR. If not supported on the underlying platform, emulate it
724 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
725 Value *Builder::PMOVSXWD(Value* a)
726 {
727 // llvm-3.9 removed the pmovsxwd intrinsic
728 #if HAVE_LLVM < 0x309
729 // use avx2 word sign extend if available
730 if(JM()->mArch.AVX2())
731 {
732 Function *pmovsxwd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd);
733 return CALL(pmovsxwd, std::initializer_list<Value*>{a});
734 }
735 else
736 #endif
737 {
738 // VPMOVSXWD output type
739 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
740 // Extract 8 values from 128bit lane and sign extend
741 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
742 }
743 }
744
745 //////////////////////////////////////////////////////////////////////////
746 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
747 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
748 /// platform, emulate it
749 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
750 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
751 Value *Builder::PERMD(Value* a, Value* idx)
752 {
753 Value* res;
754 // use avx2 permute instruction if available
755 if(JM()->mArch.AVX2())
756 {
757 res = VPERMD(a, idx);
758 }
759 else
760 {
761 if (isa<Constant>(idx))
762 {
763 res = VSHUFFLE(a, a, idx);
764 }
765 else
766 {
767 res = VUNDEF_I();
768 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
769 {
770 Value* pIndex = VEXTRACT(idx, C(l));
771 Value* pVal = VEXTRACT(a, pIndex);
772 res = VINSERT(res, pVal, C(l));
773 }
774 }
775 }
776 return res;
777 }
778
779 //////////////////////////////////////////////////////////////////////////
780 /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
781 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
782 /// platform, emulate it
783 /// @param a - 256bit SIMD lane(8x32bit) of float values.
784 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
785 Value *Builder::PERMPS(Value* a, Value* idx)
786 {
787 Value* res;
788 // use avx2 permute instruction if available
789 if (JM()->mArch.AVX2())
790 {
791 // llvm 3.6.0 swapped the order of the args to vpermd
792 res = VPERMPS(idx, a);
793 }
794 else
795 {
796 if (isa<Constant>(idx))
797 {
798 res = VSHUFFLE(a, a, idx);
799 }
800 else
801 {
802 res = VUNDEF_F();
803 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
804 {
805 Value* pIndex = VEXTRACT(idx, C(l));
806 Value* pVal = VEXTRACT(a, pIndex);
807 res = VINSERT(res, pVal, C(l));
808 }
809 }
810 }
811
812 return res;
813 }
814
815 //////////////////////////////////////////////////////////////////////////
816 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
817 /// in LLVM IR. If not supported on the underlying platform, emulate it
818 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
819 Value *Builder::CVTPH2PS(Value* a)
820 {
821 if (JM()->mArch.F16C())
822 {
823 return VCVTPH2PS(a);
824 }
825 else
826 {
827 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
828 Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));
829
830 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
831 {
832 sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
833 }
834
835 Value* pResult = UndefValue::get(mSimdFP32Ty);
836 for (uint32_t i = 0; i < mVWidth; ++i)
837 {
838 Value* pSrc = VEXTRACT(a, C(i));
839 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
840 pResult = VINSERT(pResult, pConv, C(i));
841 }
842
843 return pResult;
844 }
845 }
846
847 //////////////////////////////////////////////////////////////////////////
848 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
849 /// in LLVM IR. If not supported on the underlying platform, emulate it
850 /// @param a - 256bit SIMD lane(8x32bit) of float32 values.
851 Value *Builder::CVTPS2PH(Value* a, Value* rounding)
852 {
853 if (JM()->mArch.F16C())
854 {
855 return VCVTPS2PH(a, rounding);
856 }
857 else
858 {
859 // call scalar C function for now
860 FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
861 Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));
862
863 if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
864 {
865 sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
866 }
867
868 Value* pResult = UndefValue::get(mSimdInt16Ty);
869 for (uint32_t i = 0; i < mVWidth; ++i)
870 {
871 Value* pSrc = VEXTRACT(a, C(i));
872 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
873 pResult = VINSERT(pResult, pConv, C(i));
874 }
875
876 return pResult;
877 }
878 }
879
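//////////////////////////////////////////////////////////////////////////
/// @brief Per-lane signed 32-bit integer max. On llvm >= 3.9 this is a
/// compare/select; older llvm uses the avx2 or sse4.1 pmaxsd intrinsics.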
880 Value *Builder::PMAXSD(Value* a, Value* b)
881 {
882 // llvm-3.9 removed the pmax intrinsics
883 #if HAVE_LLVM >= 0x309
884 Value* cmp = ICMP_SGT(a, b);
885 return SELECT(cmp, a, b);
886 #else
887 if (JM()->mArch.AVX2())
888 {
889 Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmaxs_d);
890 return CALL(pmaxsd, {a, b});
891 }
892 else
893 {
894 // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
895 Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);
896
897 // low 128
898 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
899 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
900 Value* resLo = CALL(pmaxsd, {aLo, bLo});
901
902 // high 128
903 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
904 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
905 Value* resHi = CALL(pmaxsd, {aHi, bHi});
906
907 // combine
908 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
909 result = VINSERTI128(result, resHi, C((uint8_t)1));
910
911 return result;
912 }
913 #endif
914 }
915
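//////////////////////////////////////////////////////////////////////////
/// @brief Per-lane signed 32-bit integer min. On llvm >= 3.9 this is a
/// compare/select; older llvm uses the avx2 or sse4.1 pminsd intrinsics.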
916 Value *Builder::PMINSD(Value* a, Value* b)
917 {
918 // llvm-3.9 removed the pmin intrinsics
919 #if HAVE_LLVM >= 0x309
920 Value* cmp = ICMP_SLT(a, b);
921 return SELECT(cmp, a, b);
922 #else
923 if (JM()->mArch.AVX2())
924 {
925 Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmins_d);
926 return CALL(pminsd, {a, b});
927 }
928 else
929 {
930 // use 4-wide sse min intrinsic on lower/upper halves of 8-wide sources
931 Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);
932
933 // low 128
934 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
935 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
936 Value* resLo = CALL(pminsd, {aLo, bLo});
937
938 // high 128
939 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
940 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
941 Value* resHi = CALL(pminsd, {aHi, bHi});
942
943 // combine
944 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
945 result = VINSERTI128(result, resHi, C((uint8_t)1));
946
947 return result;
948 }
949 #endif
950 }
951
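//////////////////////////////////////////////////////////////////////////
/// @brief Gather a SIMD of each enabled component for the given format,
/// dispatching to GATHER4PS for 32-bit float formats and GATHER4DD otherwise.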
952 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
953 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
954 {
955 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
956 if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
957 {
958 // ensure our mask is the correct type
959 mask = BITCAST(mask, mSimdFP32Ty);
960 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
961 }
962 else
963 {
964 // ensure our mask is the correct type
965 mask = BITCAST(mask, mSimdInt32Ty);
966 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
967 }
968 }
969
970 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
971 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
972 {
973 switch(info.bpp / info.numComps)
974 {
975 case 16:
976 {
977 Value* vGatherResult[2];
978 Value *vMask;
979
980 // TODO: vGatherMaskedVal
981 Value* vGatherMaskedVal = VIMMED1((float)0);
982
983 // always have at least one component out of x or y to fetch
984
985 // save mask as it is zero'd out after each gather
986 vMask = mask;
987
988 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
989 // e.g. result of first 8x32bit integer gather for 16bit components
990 // 256i - 0 1 2 3 4 5 6 7
991 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
992 //
993
994 // if we have at least one component out of z or w to fetch
995 if(info.numComps > 2)
996 {
997 // offset base to the next components(zw) in the vertex to gather
998 pSrcBase = GEP(pSrcBase, C((char)4));
999 vMask = mask;
1000
1001 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1002 // e.g. result of second 8x32bit integer gather for 16bit components
1003 // 256i - 0 1 2 3 4 5 6 7
1004 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1005 //
1006 }
1007 else
1008 {
1009 vGatherResult[1] = vGatherMaskedVal;
1010 }
1011
1012 // Shuffle gathered components into place, each row is a component
1013 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1014 }
1015 break;
1016 case 32:
1017 {
1018 // apply defaults
1019 for (uint32_t i = 0; i < 4; ++i)
1020 {
1021 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1022 }
1023
1024 for(uint32_t i = 0; i < info.numComps; i++)
1025 {
1026 uint32_t swizzleIndex = info.swizzle[i];
1027
1028 // save mask as it is zero'd out after each gather
1029 Value *vMask = mask;
1030
1031 // Gather a SIMD of components
1032 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1033
1034 // offset base to the next component to gather
1035 pSrcBase = GEP(pSrcBase, C((char)4));
1036 }
1037 }
1038 break;
1039 default:
1040 SWR_ASSERT(0, "Invalid float format");
1041 break;
1042 }
1043 }
1044
1045 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1046 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1047 {
1048 switch (info.bpp / info.numComps)
1049 {
1050 case 8:
1051 {
1052 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1053 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
1054 // e.g. result of an 8x32bit integer gather for 8bit components
1055 // 256i - 0 1 2 3 4 5 6 7
1056 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1057
1058 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1059 }
1060 break;
1061 case 16:
1062 {
1063 Value* vGatherResult[2];
1064 Value *vMask;
1065
1066 // TODO: vGatherMaskedVal
1067 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1068
1069 // always have at least one component out of x or y to fetch
1070
1071 // save mask as it is zero'd out after each gather
1072 vMask = mask;
1073
1074 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1075 // e.g. result of first 8x32bit integer gather for 16bit components
1076 // 256i - 0 1 2 3 4 5 6 7
1077 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1078 //
1079
1080 // if we have at least one component out of z or w to fetch
1081 if(info.numComps > 2)
1082 {
1083 // offset base to the next components(zw) in the vertex to gather
1084 pSrcBase = GEP(pSrcBase, C((char)4));
1085 vMask = mask;
1086
1087 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1088 // e.g. result of second 8x32bit integer gather for 16bit components
1089 // 256i - 0 1 2 3 4 5 6 7
1090 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1091 //
1092 }
1093 else
1094 {
1095 vGatherResult[1] = vGatherMaskedVal;
1096 }
1097
1098 // Shuffle gathered components into place, each row is a component
1099 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1100
1101 }
1102 break;
1103 case 32:
1104 {
1105 // apply defaults
1106 for (uint32_t i = 0; i < 4; ++i)
1107 {
1108 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1109 }
1110
1111 for(uint32_t i = 0; i < info.numComps; i++)
1112 {
1113 uint32_t swizzleIndex = info.swizzle[i];
1114
1115 // save mask as it is zero'd out after each gather
1116 Value *vMask = mask;
1117
1118 // Gather a SIMD of components
1119 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1120
1121 // offset base to the next component to gather
1122 pSrcBase = GEP(pSrcBase, C((char)4));
1123 }
1124 }
1125 break;
1126 default:
1127 SWR_ASSERT(0, "unsupported format");
1128 break;
1129 }
1130 }
1131
1132 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1133 {
1134 // cast types
1135 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1136 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1137
1138 // input could either be float or int vector; do shuffle work in int
1139 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1140 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1141
1142 if(bPackedOutput)
1143 {
1144 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1145
1146 // shuffle mask
1147 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1148 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1149 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1150 // after pshufb: group components together in each 128bit lane
1151 // 256i - 0 1 2 3 4 5 6 7
1152 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1153
1154 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1155 // after PERMD: move and pack xy components into each 128bit lane
1156 // 256i - 0 1 2 3 4 5 6 7
1157 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1158
1159 // do the same for zw components
1160 Value* vi128ZW = nullptr;
1161 if(info.numComps > 2)
1162 {
1163 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1164 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1165 }
1166
1167 for(uint32_t i = 0; i < 4; i++)
1168 {
1169 uint32_t swizzleIndex = info.swizzle[i];
1170 // todo: fix for packed
1171 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1172 if(i >= info.numComps)
1173 {
1174 // set the default component val
1175 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1176 continue;
1177 }
1178
1179 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1180 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1181 // if x or y, use vi128XY permute result, else use vi128ZW
1182 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1183
1184 // extract packed component 128 bit lanes
1185 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1186 }
1187
1188 }
1189 else
1190 {
1191 // pshufb masks for each component
1192 Value* vConstMask[2];
1193 // x/z shuffle mask
1194 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1195 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1196
1197 // y/w shuffle mask
1198 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1199 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1200
1201
1202 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1203 // apply defaults
1204 for (uint32_t i = 0; i < 4; ++i)
1205 {
1206 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1207 }
1208
1209 for(uint32_t i = 0; i < info.numComps; i++)
1210 {
1211 uint32_t swizzleIndex = info.swizzle[i];
1212
1213 // select correct constMask for x/z or y/w pshufb
1214 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1215 // if x or y, use vi128XY permute result, else use vi128ZW
1216 uint32_t selectedGather = (i < 2) ? 0 : 1;
1217
1218 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1219 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1220 // 256i - 0 1 2 3 4 5 6 7
1221 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1222 }
1223 }
1224 }
1225
1226 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1227 {
1228 // cast types
1229 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1230 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1231
1232 if(bPackedOutput)
1233 {
1234 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1235 // shuffle mask
1236 Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1237 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1238 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1239 // after pshufb: group components together in each 128bit lane
1240 // 256i - 0 1 2 3 4 5 6 7
1241 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1242
1243 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1244 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1245 // 256i - 0 1 2 3 4 5 6 7
1246 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1247
1248 // do the same for zw components
1249 Value* vi128ZW = nullptr;
1250 if(info.numComps > 2)
1251 {
1252 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1253 }
1254
1255 // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
1256 for(uint32_t i = 0; i < 4; i++)
1257 {
1258 uint32_t swizzleIndex = info.swizzle[i];
1259 // todo: fix for packed
1260 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1261 if(i >= info.numComps)
1262 {
1263 // set the default component val
1264 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1265 continue;
1266 }
1267
1268 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1269 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1270 // if x or y, use vi128XY permute result, else use vi128ZW
1271 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1272
1273 // sign extend
1274 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1275 }
1276 }
1277 // else zero extend
1278 else{
1279 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1280 // apply defaults
1281 for (uint32_t i = 0; i < 4; ++i)
1282 {
1283 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1284 }
1285
1286 for(uint32_t i = 0; i < info.numComps; i++){
1287 uint32_t swizzleIndex = info.swizzle[i];
1288
1289 // pshufb masks for each component
1290 Value* vConstMask;
1291 switch(i)
1292 {
1293 case 0:
1294 // x shuffle mask
1295 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1296 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1297 break;
1298 case 1:
1299 // y shuffle mask
1300 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1301 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1302 break;
1303 case 2:
1304 // z shuffle mask
1305 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1306 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1307 break;
1308 case 3:
1309 // w shuffle mask
1310 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1311 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1312 break;
1313 default:
1314 vConstMask = nullptr;
1315 break;
1316 }
1317
1318 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1319 // after pshufb for x channel
1320 // 256i - 0 1 2 3 4 5 6 7
1321 // x000 x000 x000 x000 x000 x000 x000 x000
1322 }
1323 }
1324 }
1325
1326 //////////////////////////////////////////////////////////////////////////
1327 /// @brief emulates a scatter operation.
1328 /// @param pDst - pointer to destination
1329 /// @param vSrc - vector of src data to scatter
1330 /// @param vOffsets - vector of byte offsets from pDst
1331 /// @param vMask - mask of valid lanes
1332 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1333 {
1334 Value* pStack = STACKSAVE();
1335
1336 Type* pSrcTy = vSrc->getType()->getVectorElementType();
1337
1338 // allocate tmp stack for masked off lanes
1339 Value* vTmpPtr = ALLOCA(pSrcTy);
1340
1341 Value *mask = MASK(vMask);
1342 for (uint32_t i = 0; i < mVWidth; ++i)
1343 {
1344 Value *offset = VEXTRACT(vOffsets, C(i));
1345 // byte pointer to component
1346 Value *storeAddress = GEP(pDst, offset);
1347 storeAddress = BITCAST(storeAddress, PointerType::get(pSrcTy, 0));
1348 Value *selMask = VEXTRACT(mask, C(i));
1349 Value *srcElem = VEXTRACT(vSrc, C(i));
1350 // if this lane is masked off, store to the temporary stack slot instead of memory
1351 Value *validAddress = SELECT(selMask, storeAddress, vTmpPtr);
1352 STORE(srcElem, validAddress);
1353 }
1354
1355 STACKRESTORE(pStack);
1356 }
1357
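//////////////////////////////////////////////////////////////////////////
/// @brief Per-lane float absolute value, implemented by clearing the sign bit.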
1358 Value* Builder::VABSPS(Value* a)
1359 {
1360 Value* asInt = BITCAST(a, mSimdInt32Ty);
1361 Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1362 return result;
1363 }
1364
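//////////////////////////////////////////////////////////////////////////
/// @brief Clamp each signed integer lane of src to the range [low, high].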
1365 Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
1366 {
1367 Value *lowCmp = ICMP_SLT(src, low);
1368 Value *ret = SELECT(lowCmp, low, src);
1369
1370 Value *highCmp = ICMP_SGT(ret, high);
1371 ret = SELECT(highCmp, high, ret);
1372
1373 return ret;
1374 }
1375
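//////////////////////////////////////////////////////////////////////////
/// @brief Clamp each float lane of src to the range [low, high],
/// e.g. FCLAMP(x, C(0.0f), C(1.0f)) saturates to [0, 1].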
1376 Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1377 {
1378 Value *lowCmp = FCMP_OLT(src, low);
1379 Value *ret = SELECT(lowCmp, low, src);
1380
1381 Value *highCmp = FCMP_OGT(ret, high);
1382 ret = SELECT(highCmp, high, ret);
1383
1384 return ret;
1385 }
1386
1387 Value *Builder::FCLAMP(Value* src, float low, float high)
1388 {
1389 Value* result = VMAXPS(src, VIMMED1(low));
1390 result = VMINPS(result, VIMMED1(high));
1391
1392 return result;
1393 }
1394
1395 //////////////////////////////////////////////////////////////////////////
1396 /// @brief save/restore stack, providing ability to push/pop the stack and
1397 /// reduce overall stack requirements for temporary stack use
1398 Value* Builder::STACKSAVE()
1399 {
1400 Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1401 #if HAVE_LLVM == 0x306
1402 return CALL(pfnStackSave);
1403 #else
1404 return CALLA(pfnStackSave);
1405 #endif
1406 }
1407
1408 void Builder::STACKRESTORE(Value* pSaved)
1409 {
1410 Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1411 CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1412 }
1413
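//////////////////////////////////////////////////////////////////////////
/// @brief Per-lane a * b + c. Uses a fused multiply-add when the target
/// supports it (AVX2 check), otherwise falls back to FMUL + FADD.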
1414 Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1415 {
1416 Value* vOut;
1417 // use FMADs if available
1418 if(JM()->mArch.AVX2())
1419 {
1420 vOut = VFMADDPS(a, b, c);
1421 }
1422 else
1423 {
1424 vOut = FADD(FMUL(a, b), c);
1425 }
1426 return vOut;
1427 }
1428
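//////////////////////////////////////////////////////////////////////////
/// @brief Count the set bits in each element via the llvm.ctpop intrinsic.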
1429 Value* Builder::POPCNT(Value* a)
1430 {
1431 Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1432 return CALL(pCtPop, std::initializer_list<Value*>{a});
1433 }
1434
1435 //////////////////////////////////////////////////////////////////////////
1436 /// @brief C functions called by LLVM IR
1437 //////////////////////////////////////////////////////////////////////////
1438
1439 //////////////////////////////////////////////////////////////////////////
1440 /// @brief called in JIT code, inserted by PRINT
1441 /// output to both stdout and visual studio debug console
1442 void __cdecl CallPrint(const char* fmt, ...)
1443 {
1444 va_list args;
1445 va_start(args, fmt);
1446 vprintf(fmt, args);
1447
1448 #if defined( _WIN32 )
1449 char strBuf[1024];
1450 vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1451 OutputDebugString(strBuf);
1452 #endif
1453
1454 va_end(args);
1455 }
1456
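//////////////////////////////////////////////////////////////////////////
/// @brief Extract the low (imm8 == 0) or high (imm8 != 0) 128-bit half
/// of a 256-bit integer vector.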
1457 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1458 {
1459 #if HAVE_LLVM == 0x306
1460 Function *func =
1461 Intrinsic::getDeclaration(JM()->mpCurrentModule,
1462 Intrinsic::x86_avx_vextractf128_si_256);
1463 return CALL(func, {a, imm8});
1464 #else
1465 bool flag = !imm8->isZeroValue();
1466 SmallVector<Constant*,8> idx;
1467 for (unsigned i = 0; i < mVWidth / 2; i++) {
1468 idx.push_back(C(flag ? i + mVWidth / 2 : i));
1469 }
1470 return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1471 #endif
1472 }
1473
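//////////////////////////////////////////////////////////////////////////
/// @brief Insert the 128-bit vector b into the low (imm8 == 0) or high
/// (imm8 != 0) half of the 256-bit integer vector a.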
1474 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1475 {
1476 #if HAVE_LLVM == 0x306
1477 Function *func =
1478 Intrinsic::getDeclaration(JM()->mpCurrentModule,
1479 Intrinsic::x86_avx_vinsertf128_si_256);
1480 return CALL(func, {a, b, imm8});
1481 #else
1482 bool flag = !imm8->isZeroValue();
1483 SmallVector<Constant*,8> idx;
1484 for (unsigned i = 0; i < mVWidth; i++) {
1485 idx.push_back(C(i));
1486 }
1487 Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1488
1489 SmallVector<Constant*,8> idx2;
1490 for (unsigned i = 0; i < mVWidth / 2; i++) {
1491 idx2.push_back(C(flag ? i : i + mVWidth));
1492 }
1493 for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1494 idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1495 }
1496 return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1497 #endif
1498 }
1499
1500 // rdtsc buckets macros
1501 void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1502 {
1503 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1504 // buckets framework when single threaded
1505 if (KNOB_SINGLE_THREADED)
1506 {
1507 std::vector<Type*> args{
1508 PointerType::get(mInt32Ty, 0), // pBucketMgr
1509 mInt32Ty // id
1510 };
1511
1512 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1513 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1514 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1515 {
1516 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1517 }
1518
1519 CALL(pFunc, { pBucketMgr, pId });
1520 }
1521 }
1522
1523 void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1524 {
1525 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1526 // buckets framework when single threaded
1527 if (KNOB_SINGLE_THREADED)
1528 {
1529 std::vector<Type*> args{
1530 PointerType::get(mInt32Ty, 0), // pBucketMgr
1531 mInt32Ty // id
1532 };
1533
1534 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1535 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1536 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1537 {
1538 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1539 }
1540
1541 CALL(pFunc, { pBucketMgr, pId });
1542 }
1543 }
1544