swr: [rasterizer jitter] fix logic op to work with unorm/snorm
[mesa.git] / src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "common/rdtsc_buckets.h"
32
33
34 namespace SwrJit
35 {
36 void __cdecl CallPrint(const char* fmt, ...);
37
38 //////////////////////////////////////////////////////////////////////////
39 /// @brief Convert an IEEE 754 32-bit single precision float to a
40 /// 16-bit half-precision float with 5 exponent bits and
41 /// 10 mantissa bits.
42 /// @param val - 32-bit float
43 /// @todo Maybe move this outside of this file into a header?
44 static uint16_t Convert32To16Float(float val)
45 {
46 uint32_t sign, exp, mant;
47 uint32_t roundBits;
48
49 // Extract the sign, exponent, and mantissa
50 uint32_t uf = *(uint32_t*)&val;
51 sign = (uf & 0x80000000) >> 31;
52 exp = (uf & 0x7F800000) >> 23;
53 mant = uf & 0x007FFFFF;
54
55 // Check for out of range
56 if (std::isnan(val))
57 {
58 exp = 0x1F;
59 mant = 0x200;
60 sign = 1; // set the sign bit for NANs
61 }
62 else if (std::isinf(val))
63 {
64 exp = 0x1f;
65 mant = 0x0;
66 }
67 else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
68 {
69 exp = 0x1E;
70 mant = 0x3FF;
71 }
72 else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
73 {
74 mant |= 0x00800000;
75 for (; exp <= 0x70; mant >>= 1, exp++)
76 ;
77 exp = 0;
78 mant = mant >> 13;
79 }
80 else if (exp < 0x66) // Too small to represent -> Zero
81 {
82 exp = 0;
83 mant = 0;
84 }
85 else
86 {
87 // Saves bits that will be shifted off for rounding
88 roundBits = mant & 0x1FFFu;
89 // convert exponent and mantissa to 16 bit format
90 exp = exp - 0x70;
91 mant = mant >> 13;
92
93 // Essentially RTZ, but round up if off by only 1 lsb
94 if (roundBits == 0x1FFFu)
95 {
96 mant++;
97 // check for overflow
98 if ((mant & 0xC00u) != 0)
99 exp++;
100 // make sure only the needed bits are used
101 mant &= 0x3FF;
102 }
103 }
104
105 uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
106 return (uint16_t)tmpVal;
107 }
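// Worked example: 1.0f (0x3F800000; sign 0, exp 0x7F, mant 0) rebiases to
// exp 0x7F - 0x70 = 0x0F with mant 0, i.e. the half encoding 0x3C00. Inputs
// above the half range (biased exp > 0x8E) clamp to 0x7BFF (65504.0).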
108
109 //////////////////////////////////////////////////////////////////////////
110 /// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
111 /// float
112 /// @param val - 16-bit float
113 /// @todo Maybe move this outside of this file into a header?
114 static float ConvertSmallFloatTo32(UINT val)
115 {
116 UINT result;
117 if ((val & 0x7fff) == 0)
118 {
119 result = ((uint32_t)(val & 0x8000)) << 16;
120 }
121 else if ((val & 0x7c00) == 0x7c00)
122 {
123 result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
124 result |= ((uint32_t)val & 0x8000) << 16;
125 }
126 else
127 {
128 uint32_t sign = (val & 0x8000) << 16;
129 uint32_t mant = (val & 0x3ff) << 13;
130 uint32_t exp = (val >> 10) & 0x1f;
131 if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
132 {
133 mant <<= 1;
134 while (mant < (0x400 << 13))
135 {
136 exp--;
137 mant <<= 1;
138 }
139 mant &= (0x3ff << 13);
140 }
141 exp = ((exp - 15 + 127) & 0xff) << 23;
142 result = sign | exp | mant;
143 }
144
145 return *(float*)&result;
146 }
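// Worked example: half 0x3C00 (sign 0, exp 15, mant 0) rebiases to
// exp 15 - 15 + 127 = 127, giving 0x3F800000 == 1.0f. 0x7C00 maps to +infinity
// and any half NaN maps to the canonical float NaN 0x7FC00000.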
147
148 Constant *Builder::C(bool i)
149 {
150 return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
151 }
152
153 Constant *Builder::C(char i)
154 {
155 return ConstantInt::get(IRB()->getInt8Ty(), i);
156 }
157
158 Constant *Builder::C(uint8_t i)
159 {
160 return ConstantInt::get(IRB()->getInt8Ty(), i);
161 }
162
163 Constant *Builder::C(int i)
164 {
165 return ConstantInt::get(IRB()->getInt32Ty(), i);
166 }
167
168 Constant *Builder::C(int64_t i)
169 {
170 return ConstantInt::get(IRB()->getInt64Ty(), i);
171 }
172
173 Constant *Builder::C(uint16_t i)
174 {
175 return ConstantInt::get(mInt16Ty,i);
176 }
177
178 Constant *Builder::C(uint32_t i)
179 {
180 return ConstantInt::get(IRB()->getInt32Ty(), i);
181 }
182
183 Constant *Builder::C(float i)
184 {
185 return ConstantFP::get(IRB()->getFloatTy(), i);
186 }
187
188 Constant *Builder::PRED(bool pred)
189 {
190 return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
191 }
192
193 Value *Builder::VIMMED1(int i)
194 {
195 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
196 }
197
198 Value *Builder::VIMMED1(uint32_t i)
199 {
200 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
201 }
202
203 Value *Builder::VIMMED1(float i)
204 {
205 return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
206 }
207
208 Value *Builder::VIMMED1(bool i)
209 {
210 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
211 }
212
213 Value *Builder::VUNDEF_IPTR()
214 {
215 return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
216 }
217
218 Value *Builder::VUNDEF_I()
219 {
220 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
221 }
222
223 Value *Builder::VUNDEF(Type *ty, uint32_t size)
224 {
225 return UndefValue::get(VectorType::get(ty, size));
226 }
227
228 Value *Builder::VUNDEF_F()
229 {
230 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
231 }
232
233 Value *Builder::VUNDEF(Type* t)
234 {
235 return UndefValue::get(VectorType::get(t, mVWidth));
236 }
237
238 #if HAVE_LLVM == 0x306
239 Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
240 {
241 return VINSERT(vec, val, C((int64_t)index));
242 }
243 #endif
244
245 Value *Builder::VBROADCAST(Value *src)
246 {
247 // check if src is already a vector
248 if (src->getType()->isVectorTy())
249 {
250 return src;
251 }
252
253 return VECTOR_SPLAT(mVWidth, src);
254 }
255
256 uint32_t Builder::IMMED(Value* v)
257 {
258 SWR_ASSERT(isa<ConstantInt>(v));
259 ConstantInt *pValConst = cast<ConstantInt>(v);
260 return pValConst->getZExtValue();
261 }
262
263 int32_t Builder::S_IMMED(Value* v)
264 {
265 SWR_ASSERT(isa<ConstantInt>(v));
266 ConstantInt *pValConst = cast<ConstantInt>(v);
267 return pValConst->getSExtValue();
268 }
269
270 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
271 {
272 std::vector<Value*> indices;
273 for (auto i : indexList)
274 indices.push_back(i);
275 return GEPA(ptr, indices);
276 }
277
278 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
279 {
280 std::vector<Value*> indices;
281 for (auto i : indexList)
282 indices.push_back(C(i));
283 return GEPA(ptr, indices);
284 }
285
286 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
287 {
288 std::vector<Value*> valIndices;
289 for (auto i : indices)
290 valIndices.push_back(C(i));
291 return LOAD(GEPA(basePtr, valIndices), name);
292 }
293
294 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
295 {
296 std::vector<Value*> valIndices;
297 for (auto i : indices)
298 valIndices.push_back(i);
299 return LOAD(GEPA(basePtr, valIndices), name);
300 }
301
302 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
303 {
304 std::vector<Value*> valIndices;
305 for (auto i : indices)
306 valIndices.push_back(C(i));
307 return STORE(val, GEPA(basePtr, valIndices));
308 }
309
310 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
311 {
312 std::vector<Value*> valIndices;
313 for (auto i : indices)
314 valIndices.push_back(i);
315 return STORE(val, GEPA(basePtr, valIndices));
316 }
317
318 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
319 {
320 std::vector<Value*> args;
321 for (auto arg : argsList)
322 args.push_back(arg);
323 return CALLA(Callee, args);
324 }
325
326 #if HAVE_LLVM > 0x306
327 CallInst *Builder::CALL(Value *Callee, Value* arg)
328 {
329 std::vector<Value*> args;
330 args.push_back(arg);
331 return CALLA(Callee, args);
332 }
333
334 CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
335 {
336 std::vector<Value*> args;
337 args.push_back(arg1);
338 args.push_back(arg2);
339 return CALLA(Callee, args);
340 }
341
342 CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
343 {
344 std::vector<Value*> args;
345 args.push_back(arg1);
346 args.push_back(arg2);
347 args.push_back(arg3);
348 return CALLA(Callee, args);
349 }
350 #endif
351
352 Value *Builder::VRCP(Value *va)
353 {
354 return FDIV(VIMMED1(1.0f), va); // 1 / a
355 }
356
357 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
358 {
359 Value* vOut = FMADDPS(vA, vX, vC);
360 vOut = FMADDPS(vB, vY, vOut);
361 return vOut;
362 }
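// VPLANEPS evaluates the plane equation vA * vX + vB * vY + vC per SIMD lane,
// e.g. for interpolating a triangle attribute at per-sample (x, y) positions.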
363
364 //////////////////////////////////////////////////////////////////////////
365 /// @brief Generate an i32 masked load operation in LLVM IR. If not
366 /// supported on the underlying platform, emulate it with float masked load
367 /// @param src - base address pointer for the load
368 /// @param vMask - SIMD wide mask that controls whether to access memory or load 0
369 Value *Builder::MASKLOADD(Value* src,Value* mask)
370 {
371 Value* vResult;
372 // use avx2 maskload instruction if available
373 if(JM()->mArch.AVX2())
374 {
375 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
376 vResult = CALL(func,{src,mask});
377 }
378 else
379 {
380 // maskload intrinsic expects integer mask operand in llvm >= 3.8
381 #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
382 mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
383 #else
384 mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
385 #endif
386 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
387 vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
388 }
389 return vResult;
390 }
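// Illustrative usage sketch (pBufferAddr and vLaneEnable are hypothetical values):
//   Value* vData = MASKLOADD(pBufferAddr, VMASK(vLaneEnable));
// Lanes whose 32-bit mask element has the sign bit set read from memory;
// masked-off lanes return 0.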
391
392 //////////////////////////////////////////////////////////////////////////
393 /// @brief insert a JIT call to CallPrint
394 /// - outputs formatted string to both stdout and VS output window
395 /// - DEBUG builds only
396 /// Usage example:
397 /// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
398 /// where C(lane) creates a constant value to print, and pIndex is the Value*
399 /// result from a GEP, printing out the pointer to memory
400 /// @param printStr - constant string to print, which includes format specifiers
401 /// @param printArgs - initializer list of Value*'s to print to std out
402 CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
403 {
404 // push the arguments to CallPrint into a vector
405 std::vector<Value*> printCallArgs;
406 // save room for the format string. we still need to modify it for vectors
407 printCallArgs.resize(1);
408
409 // search through the format string for special processing
410 size_t pos = 0;
411 std::string tempStr(printStr);
412 pos = tempStr.find('%', pos);
413 auto v = printArgs.begin();
414
415 while ((pos != std::string::npos) && (v != printArgs.end()))
416 {
417 Value* pArg = *v;
418 Type* pType = pArg->getType();
419
420 if (pType->isVectorTy())
421 {
422 Type* pContainedType = pType->getContainedType(0);
423
424 if (toupper(tempStr[pos + 1]) == 'X')
425 {
426 tempStr[pos] = '0';
427 tempStr[pos + 1] = 'x';
428 tempStr.insert(pos + 2, "%08X ");
429 pos += 7;
430
431 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
432
433 std::string vectorFormatStr;
434 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
435 {
436 vectorFormatStr += "0x%08X ";
437 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
438 }
439
440 tempStr.insert(pos, vectorFormatStr);
441 pos += vectorFormatStr.size();
442 }
443 else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
444 {
445 uint32_t i = 0;
446 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
447 {
448 tempStr.insert(pos, std::string("%f "));
449 pos += 3;
450 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
451 }
452 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
453 }
454 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
455 {
456 uint32_t i = 0;
457 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
458 {
459 tempStr.insert(pos, std::string("%d "));
460 pos += 3;
461 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
462 }
463 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
464 }
465 }
466 else
467 {
468 if (toupper(tempStr[pos + 1]) == 'X')
469 {
470 tempStr[pos] = '0';
471 tempStr.insert(pos + 1, "x%08");
472 printCallArgs.push_back(pArg);
473 pos += 3;
474 }
475 // for %f we need to cast float Values to doubles so that they print out correctly
476 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
477 {
478 printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
479 pos++;
480 }
481 else
482 {
483 printCallArgs.push_back(pArg);
484 }
485 }
486
487 // advance to the next argument
488 v++;
489 pos = tempStr.find('%', ++pos);
490 }
491
492 // create global variable constant string
493 Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
494 GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
495 JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
496
497 // get a pointer to the first character in the constant string array
498 std::vector<Constant*> geplist{C(0),C(0)};
499 #if HAVE_LLVM == 0x306
500 Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
501 #else
502 Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
503 #endif
504
505 // insert the pointer to the format string in the argument vector
506 printCallArgs[0] = strGEP;
507
508 // get pointer to CallPrint function and insert decl into the module if needed
509 std::vector<Type*> args;
510 args.push_back(PointerType::get(mInt8Ty,0));
511 FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
512 Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
513
514 // if we haven't yet added the symbol to the symbol table
515 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
516 {
517 sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
518 }
519
520 // insert a call to CallPrint
521 return CALLA(callPrintFn,printCallArgs);
522 }
523
524 //////////////////////////////////////////////////////////////////////////
525 /// @brief Wrapper around PRINT with initializer list.
526 CallInst* Builder::PRINT(const std::string &printStr)
527 {
528 return PRINT(printStr, {});
529 }
530
531 //////////////////////////////////////////////////////////////////////////
532 /// @brief Generate a masked gather operation in LLVM IR. If not
533 /// supported on the underlying platform, emulate it with loads
534 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
535 /// @param pBase - Int8* base VB address pointer value
536 /// @param vIndices - SIMD wide value of VB byte offsets
537 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
538 /// @param scale - value to scale indices by
539 Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
540 {
541 Value* vGather;
542
543 // use avx2 gather instruction if available
544 if(JM()->mArch.AVX2())
545 {
546 // force mask to <N x float>, required by vgather
547 vMask = BITCAST(vMask, mSimdFP32Ty);
548 vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
549 }
550 else
551 {
552 Value* pStack = STACKSAVE();
553
554 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
555 Value* vSrcPtr = ALLOCA(vSrc->getType());
556 STORE(vSrc, vSrcPtr);
557
558 vGather = VUNDEF_F();
559 Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
560 Value *vOffsets = MUL(vIndices,vScaleVec);
561 Value *mask = MASK(vMask);
562 for(uint32_t i = 0; i < mVWidth; ++i)
563 {
564 // single component byte index
565 Value *offset = VEXTRACT(vOffsets,C(i));
566 // byte pointer to component
567 Value *loadAddress = GEP(pBase,offset);
568 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
569 // pointer to the value to load if we're masking off a component
570 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
571 Value *selMask = VEXTRACT(mask,C(i));
572 // switch in a safe stack address to load from if the lane is masked off
573 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
574 Value *val = LOAD(validAddress);
575 vGather = VINSERT(vGather,val,C(i));
576 }
577 STACKRESTORE(pStack);
578 }
579
580 return vGather;
581 }
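// Per lane the gather above is equivalent to (illustrative pseudocode):
//   vGather[i] = mask[i] ? *(float*)(pBase + vIndices[i] * scale) : vSrc[i];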
582
583 //////////////////////////////////////////////////////////////////////////
584 /// @brief Generate a masked gather operation in LLVM IR. If not
585 /// supported on the underlying platform, emulate it with loads
586 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
587 /// @param pBase - Int8* base VB address pointer value
588 /// @param vIndices - SIMD wide value of VB byte offsets
589 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
590 /// @param scale - value to scale indices by
591 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
592 {
593 Value* vGather;
594
595 // use avx2 gather instruction if available
596 if(JM()->mArch.AVX2())
597 {
598 vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
599 }
600 else
601 {
602 Value* pStack = STACKSAVE();
603
604 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
605 Value* vSrcPtr = ALLOCA(vSrc->getType());
606 STORE(vSrc, vSrcPtr);
607
608 vGather = VUNDEF_I();
609 Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
610 Value *vOffsets = MUL(vIndices, vScaleVec);
611 Value *mask = MASK(vMask);
612 for(uint32_t i = 0; i < mVWidth; ++i)
613 {
614 // single component byte index
615 Value *offset = VEXTRACT(vOffsets, C(i));
616 // byte pointer to component
617 Value *loadAddress = GEP(pBase, offset);
618 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
619 // pointer to the value to load if we're masking off a component
620 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
621 Value *selMask = VEXTRACT(mask, C(i));
622 // switch in a safe stack address to load from if the lane is masked off
623 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
624 Value *val = LOAD(validAddress, C(0));
625 vGather = VINSERT(vGather, val, C(i));
626 }
627
628 STACKRESTORE(pStack);
629 }
630 return vGather;
631 }
632
633 //////////////////////////////////////////////////////////////////////////
634 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
635 Value* Builder::MASK(Value* vmask)
636 {
637 Value* src = BITCAST(vmask, mSimdInt32Ty);
638 return ICMP_SLT(src, VIMMED1(0));
639 }
640
641 //////////////////////////////////////////////////////////////////////////
642 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
643 Value* Builder::VMASK(Value* mask)
644 {
645 return S_EXT(mask, mSimdInt32Ty);
646 }
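// Example: an <8 x i1> mask <1,1,0,...> becomes the x86-style <8 x i32> mask
// <-1,-1,0,...> via VMASK (sign extension); MASK() recovers the <8 x i1> form
// by testing the sign bit of each 32-bit lane.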
647
648 //////////////////////////////////////////////////////////////////////////
649 /// @brief Generate a VPSHUFB operation in LLVM IR. If not
650 /// supported on the underlying platform, emulate it
651 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
652 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
653 /// Byte masks in the lower 128-bit lane of b select 8-bit values from the
654 /// lower 128 bits of a, and likewise for the upper lane. If the mask
655 /// value is negative, '0' is inserted.
656 Value *Builder::PSHUFB(Value* a, Value* b)
657 {
658 Value* res;
659 // use avx2 pshufb instruction if available
660 if(JM()->mArch.AVX2())
661 {
662 res = VPSHUFB(a, b);
663 }
664 else
665 {
666 Constant* cB = dyn_cast<Constant>(b);
667 // number of 8 bit elements in b
668 uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
669 // output vector
670 Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
671
672 // insert an 8 bit value from the high and low lanes of a per loop iteration
673 numElms /= 2;
674 for(uint32_t i = 0; i < numElms; i++)
675 {
676 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
677 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
678
679 // extract values from constant mask
680 char valLow128bLane = (char)(cLow128b->getSExtValue());
681 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
682
683 Value* insertValLow128b;
684 Value* insertValHigh128b;
685
686 // if the mask value is negative, insert a '0' in the respective output position
687 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
688 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
689 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
690
691 vShuf = VINSERT(vShuf, insertValLow128b, i);
692 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
693 }
694 res = vShuf;
695 }
696 return res;
697 }
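// Per output byte i the shuffle above computes (illustrative pseudocode):
//   res[i] = (b[i] < 0) ? 0 : a[(b[i] & 0xF) + (i < 16 ? 0 : 16)];
// i.e. selections never cross the 128-bit halves of a.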
698
699 //////////////////////////////////////////////////////////////////////////
700 /// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32
701 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
702 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
703 /// lower 8 values are used.
704 Value *Builder::PMOVSXBD(Value* a)
705 {
706 // llvm-3.9 removed the pmovsxbd intrinsic
707 #if HAVE_LLVM < 0x309
708 // use avx2 byte sign extend instruction if available
709 if(JM()->mArch.AVX2())
710 {
711 Function *pmovsxbd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd);
712 return CALL(pmovsxbd, std::initializer_list<Value*>{a});
713 }
714 else
715 #endif
716 {
717 // VPMOVSXBD output type
718 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
719 // Extract 8 values from 128bit lane and sign extend
720 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
721 }
722 }
723
724 //////////////////////////////////////////////////////////////////////////
725 /// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32
726 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
727 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
728 Value *Builder::PMOVSXWD(Value* a)
729 {
730 // llvm-3.9 removed the pmovsxwd intrinsic
731 #if HAVE_LLVM < 0x309
732 // use avx2 word sign extend if available
733 if(JM()->mArch.AVX2())
734 {
735 Function *pmovsxwd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd);
736 return CALL(pmovsxwd, std::initializer_list<Value*>{a});
737 }
738 else
739 #endif
740 {
741 // VPMOVSXWD output type
742 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
743 // Extract 8 values from 128bit lane and sign extend
744 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
745 }
746 }
747
748 //////////////////////////////////////////////////////////////////////////
749 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
750 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
751 /// platform, emulate it
752 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
753 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
754 Value *Builder::PERMD(Value* a, Value* idx)
755 {
756 Value* res;
757 // use avx2 permute instruction if available
758 if(JM()->mArch.AVX2())
759 {
760 res = VPERMD(a, idx);
761 }
762 else
763 {
764 if (isa<Constant>(idx))
765 {
766 res = VSHUFFLE(a, a, idx);
767 }
768 else
769 {
770 res = VUNDEF_I();
771 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
772 {
773 Value* pIndex = VEXTRACT(idx, C(l));
774 Value* pVal = VEXTRACT(a, pIndex);
775 res = VINSERT(res, pVal, C(l));
776 }
777 }
778 }
779 return res;
780 }
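// Example: PERMD(a, C<int32_t>({7, 6, 5, 4, 3, 2, 1, 0})) reverses the eight
// 32-bit elements of a, including moves across the 128-bit lane boundary.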
781
782 //////////////////////////////////////////////////////////////////////////
783 /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
784 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
785 /// platform, emulate it
786 /// @param a - 256bit SIMD lane(8x32bit) of float values.
787 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
788 Value *Builder::PERMPS(Value* a, Value* idx)
789 {
790 Value* res;
791 // use avx2 permute instruction if available
792 if (JM()->mArch.AVX2())
793 {
794 // llvm 3.6.0 swapped the order of the args to vpermd
795 res = VPERMPS(idx, a);
796 }
797 else
798 {
799 if (isa<Constant>(idx))
800 {
801 res = VSHUFFLE(a, a, idx);
802 }
803 else
804 {
805 res = VUNDEF_F();
806 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
807 {
808 Value* pIndex = VEXTRACT(idx, C(l));
809 Value* pVal = VEXTRACT(a, pIndex);
810 res = VINSERT(res, pVal, C(l));
811 }
812 }
813 }
814
815 return res;
816 }
817
818 //////////////////////////////////////////////////////////////////////////
819 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
820 /// in LLVM IR. If not supported on the underlying platform, emulate it
821 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
822 Value *Builder::CVTPH2PS(Value* a)
823 {
824 if (JM()->mArch.F16C())
825 {
826 return VCVTPH2PS(a);
827 }
828 else
829 {
830 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
831 Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));
832
833 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
834 {
835 sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
836 }
837
838 Value* pResult = UndefValue::get(mSimdFP32Ty);
839 for (uint32_t i = 0; i < mVWidth; ++i)
840 {
841 Value* pSrc = VEXTRACT(a, C(i));
842 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
843 pResult = VINSERT(pResult, pConv, C(i));
844 }
845
846 return pResult;
847 }
848 }
849
850 //////////////////////////////////////////////////////////////////////////
851 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
852 /// in LLVM IR. If not supported on the underlying platform, emulate it
853 /// @param a - 256bit SIMD lane(8x32bit) of float32 values.
854 Value *Builder::CVTPS2PH(Value* a, Value* rounding)
855 {
856 if (JM()->mArch.F16C())
857 {
858 return VCVTPS2PH(a, rounding);
859 }
860 else
861 {
862 // call scalar C function for now
863 FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
864 Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));
865
866 if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
867 {
868 sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
869 }
870
871 Value* pResult = UndefValue::get(mSimdInt16Ty);
872 for (uint32_t i = 0; i < mVWidth; ++i)
873 {
874 Value* pSrc = VEXTRACT(a, C(i));
875 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
876 pResult = VINSERT(pResult, pConv, C(i));
877 }
878
879 return pResult;
880 }
881 }
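// Round-trip example: CVTPH2PS(CVTPS2PH(x, rounding)) returns x for any value
// exactly representable in half precision, e.g. 1.0f (half 0x3C00) or 0.5f (0x3800).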
882
883 Value *Builder::PMAXSD(Value* a, Value* b)
884 {
885 // llvm-3.9 removed the pmax intrinsics
886 #if HAVE_LLVM >= 0x309
887 Value* cmp = ICMP_SGT(a, b);
888 return SELECT(cmp, a, b);
889 #else
890 if (JM()->mArch.AVX2())
891 {
892 Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmaxs_d);
893 return CALL(pmaxsd, {a, b});
894 }
895 else
896 {
897 // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
898 Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);
899
900 // low 128
901 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
902 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
903 Value* resLo = CALL(pmaxsd, {aLo, bLo});
904
905 // high 128
906 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
907 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
908 Value* resHi = CALL(pmaxsd, {aHi, bHi});
909
910 // combine
911 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
912 result = VINSERTI128(result, resHi, C((uint8_t)1));
913
914 return result;
915 }
916 #endif
917 }
918
919 Value *Builder::PMINSD(Value* a, Value* b)
920 {
921 // llvm-3.9 removed the pmin intrinsics
922 #if HAVE_LLVM >= 0x309
923 Value* cmp = ICMP_SLT(a, b);
924 return SELECT(cmp, a, b);
925 #else
926 if (JM()->mArch.AVX2())
927 {
928 Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmins_d);
929 return CALL(pminsd, {a, b});
930 }
931 else
932 {
933 // use 4-wide sse min intrinsic on lower/upper halves of 8-wide sources
934 Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);
935
936 // low 128
937 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
938 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
939 Value* resLo = CALL(pminsd, {aLo, bLo});
940
941 // high 128
942 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
943 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
944 Value* resHi = CALL(pminsd, {aHi, bHi});
945
946 // combine
947 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
948 result = VINSERTI128(result, resHi, C((uint8_t)1));
949
950 return result;
951 }
952 #endif
953 }
954
955 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
956 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
957 {
958 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
959 if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
960 {
961 // ensure our mask is the correct type
962 mask = BITCAST(mask, mSimdFP32Ty);
963 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
964 }
965 else
966 {
967 // ensure our mask is the correct type
968 mask = BITCAST(mask, mSimdInt32Ty);
969 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
970 }
971 }
972
973 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
974 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
975 {
976 switch(info.bpp / info.numComps)
977 {
978 case 16:
979 {
980 Value* vGatherResult[2];
981 Value *vMask;
982
983 // TODO: vGatherMaskedVal
984 Value* vGatherMaskedVal = VIMMED1((float)0);
985
986 // always have at least one component out of x or y to fetch
987
988 // save mask as it is zero'd out after each gather
989 vMask = mask;
990
991 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
992 // e.g. result of first 8x32bit integer gather for 16bit components
993 // 256i - 0 1 2 3 4 5 6 7
994 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
995 //
996
997 // if we have at least one component out of z or w to fetch
998 if(info.numComps > 2)
999 {
1000 // offset base to the next components(zw) in the vertex to gather
1001 pSrcBase = GEP(pSrcBase, C((char)4));
1002 vMask = mask;
1003
1004 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1005 // e.g. result of second 8x32bit integer gather for 16bit components
1006 // 256i - 0 1 2 3 4 5 6 7
1007 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1008 //
1009 }
1010 else
1011 {
1012 vGatherResult[1] = vGatherMaskedVal;
1013 }
1014
1015 // Shuffle gathered components into place, each row is a component
1016 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1017 }
1018 break;
1019 case 32:
1020 {
1021 // apply defaults
1022 for (uint32_t i = 0; i < 4; ++i)
1023 {
1024 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1025 }
1026
1027 for(uint32_t i = 0; i < info.numComps; i++)
1028 {
1029 uint32_t swizzleIndex = info.swizzle[i];
1030
1031 // save mask as it is zero'd out after each gather
1032 Value *vMask = mask;
1033
1034 // Gather a SIMD of components
1035 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1036
1037 // offset base to the next component to gather
1038 pSrcBase = GEP(pSrcBase, C((char)4));
1039 }
1040 }
1041 break;
1042 default:
1043 SWR_ASSERT(0, "Invalid float format");
1044 break;
1045 }
1046 }
1047
1048 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1049 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1050 {
1051 switch (info.bpp / info.numComps)
1052 {
1053 case 8:
1054 {
1055 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1056 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
1057 // e.g. result of an 8x32bit integer gather for 8bit components
1058 // 256i - 0 1 2 3 4 5 6 7
1059 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1060
1061 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1062 }
1063 break;
1064 case 16:
1065 {
1066 Value* vGatherResult[2];
1067 Value *vMask;
1068
1069 // TODO: vGatherMaskedVal
1070 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1071
1072 // always have at least one component out of x or y to fetch
1073
1074 // save mask as it is zero'd out after each gather
1075 vMask = mask;
1076
1077 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1078 // e.g. result of first 8x32bit integer gather for 16bit components
1079 // 256i - 0 1 2 3 4 5 6 7
1080 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1081 //
1082
1083 // if we have at least one component out of z or w to fetch
1084 if(info.numComps > 2)
1085 {
1086 // offset base to the next components(zw) in the vertex to gather
1087 pSrcBase = GEP(pSrcBase, C((char)4));
1088 vMask = mask;
1089
1090 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1091 // e.g. result of second 8x32bit integer gather for 16bit components
1092 // 256i - 0 1 2 3 4 5 6 7
1093 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1094 //
1095 }
1096 else
1097 {
1098 vGatherResult[1] = vGatherMaskedVal;
1099 }
1100
1101 // Shuffle gathered components into place, each row is a component
1102 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1103
1104 }
1105 break;
1106 case 32:
1107 {
1108 // apply defaults
1109 for (uint32_t i = 0; i < 4; ++i)
1110 {
1111 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1112 }
1113
1114 for(uint32_t i = 0; i < info.numComps; i++)
1115 {
1116 uint32_t swizzleIndex = info.swizzle[i];
1117
1118 // save mask as it is zero'd out after each gather
1119 Value *vMask = mask;
1120
1121 // Gather a SIMD of components
1122 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1123
1124 // offset base to the next component to gather
1125 pSrcBase = GEP(pSrcBase, C((char)4));
1126 }
1127 }
1128 break;
1129 default:
1130 SWR_ASSERT(0, "unsupported format");
1131 break;
1132 }
1133 }
1134
1135 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1136 {
1137 // cast types
1138 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1139 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1140
1141 // input could either be float or int vector; do shuffle work in int
1142 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1143 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1144
1145 if(bPackedOutput)
1146 {
1147 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1148
1149 // shuffle mask
1150 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1151 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1152 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1153 // after pshufb: group components together in each 128bit lane
1154 // 256i - 0 1 2 3 4 5 6 7
1155 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1156
1157 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1158 // after PERMD: move and pack xy components into each 128bit lane
1159 // 256i - 0 1 2 3 4 5 6 7
1160 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1161
1162 // do the same for zw components
1163 Value* vi128ZW = nullptr;
1164 if(info.numComps > 2)
1165 {
1166 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1167 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1168 }
1169
1170 for(uint32_t i = 0; i < 4; i++)
1171 {
1172 uint32_t swizzleIndex = info.swizzle[i];
1173 // todo: fix for packed
1174 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1175 if(i >= info.numComps)
1176 {
1177 // set the default component val
1178 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1179 continue;
1180 }
1181
1182 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1183 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1184 // if x or y, use vi128XY permute result, else use vi128ZW
1185 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1186
1187 // extract packed component 128 bit lanes
1188 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1189 }
1190
1191 }
1192 else
1193 {
1194 // pshufb masks for each component
1195 Value* vConstMask[2];
1196 // x/z shuffle mask
1197 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1198 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1199
1200 // y/w shuffle mask
1201 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1202 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1203
1204
1205 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1206 // apply defaults
1207 for (uint32_t i = 0; i < 4; ++i)
1208 {
1209 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1210 }
1211
1212 for(uint32_t i = 0; i < info.numComps; i++)
1213 {
1214 uint32_t swizzleIndex = info.swizzle[i];
1215
1216 // select correct constMask for x/z or y/w pshufb
1217 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1218 // if x or y, use vi128XY permute result, else use vi128ZW
1219 uint32_t selectedGather = (i < 2) ? 0 : 1;
1220
1221 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1222 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1223 // 256i - 0 1 2 3 4 5 6 7
1224 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1225 }
1226 }
1227 }
1228
1229 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1230 {
1231 // cast types
1232 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1233 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1234
1235 if(bPackedOutput)
1236 {
1237 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1238 // shuffle mask
1239 Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1240 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1241 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1242 // after pshufb: group components together in each 128bit lane
1243 // 256i - 0 1 2 3 4 5 6 7
1244 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1245
1246 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1247 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1248 // 256i - 0 1 2 3 4 5 6 7
1249 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1250
1251 // do the same for zw components
1252 Value* vi128ZW = nullptr;
1253 if(info.numComps > 2)
1254 {
1255 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1256 }
1257
1258 // extract each enabled component, applying defaults for any that are missing
1259 for(uint32_t i = 0; i < 4; i++)
1260 {
1261 uint32_t swizzleIndex = info.swizzle[i];
1262 // todo: fix for packed
1263 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1264 if(i >= info.numComps)
1265 {
1266 // set the default component val
1267 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1268 continue;
1269 }
1270
1271 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1272 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1273 // if x or y, use vi128XY permute result, else use vi128ZW
1274 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1275
1276 // extract packed component 128 bit lanes
1277 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1278 }
1279 }
1280 // else zero extend
1281 else{
1282 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1283 // apply defaults
1284 for (uint32_t i = 0; i < 4; ++i)
1285 {
1286 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1287 }
1288
1289 for(uint32_t i = 0; i < info.numComps; i++){
1290 uint32_t swizzleIndex = info.swizzle[i];
1291
1292 // pshufb masks for each component
1293 Value* vConstMask;
1294 switch(i)
1295 {
1296 case 0:
1297 // x shuffle mask
1298 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1299 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1300 break;
1301 case 1:
1302 // y shuffle mask
1303 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1304 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1305 break;
1306 case 2:
1307 // z shuffle mask
1308 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1309 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1310 break;
1311 case 3:
1312 // w shuffle mask
1313 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1314 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1315 break;
1316 default:
1317 vConstMask = nullptr;
1318 break;
1319 }
1320
1321 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1322 // after pshufb for x channel
1323 // 256i - 0 1 2 3 4 5 6 7
1324 // x000 x000 x000 x000 x000 x000 x000 x000
1325 }
1326 }
1327 }
1328
1329 // Helper function to create alloca in entry block of function
1330 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
1331 {
1332 auto saveIP = IRB()->saveIP();
1333 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1334 pFunc->getEntryBlock().begin());
1335 Value* pAlloca = ALLOCA(pType);
1336 IRB()->restoreIP(saveIP);
1337 return pAlloca;
1338 }
1339
1340 //////////////////////////////////////////////////////////////////////////
1341 /// @brief emulates a scatter operation.
1342 /// @param pDst - pointer to destination
1343 /// @param vSrc - vector of src data to scatter
1344 /// @param vOffsets - vector of byte offsets from pDst
1345 /// @param vMask - mask of valid lanes
1346 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1347 {
1348 /* Scatter algorithm
1349
1350 while(Index = BitScanForward(mask))
1351 srcElem = srcVector[Index]
1352 offsetElem = offsetVector[Index]
1353 *(pDst + offsetElem) = srcElem
1354 Update mask (mask &= ~(1 << Index))
1355
1356 */
1357
1358 BasicBlock* pCurBB = IRB()->GetInsertBlock();
1359 Function* pFunc = pCurBB->getParent();
1360 Type* pSrcTy = vSrc->getType()->getVectorElementType();
1361
1362 // Store vectors on stack
1363 if (pScatterStackSrc == nullptr)
1364 {
1365 // Save off stack allocations and reuse per scatter. Significantly reduces stack
1366 // requirements for shaders with a lot of scatters.
1367 pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
1368 pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
1369 }
1370
1371 Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
1372 Value* pOffsetsArrayPtr = pScatterStackOffsets;
1373 STORE(vSrc, pSrcArrayPtr);
1374 STORE(vOffsets, pOffsetsArrayPtr);
1375
1376 // Cast to pointers for random access
1377 pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
1378 pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
1379
1380 Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
1381
1382 // Get cttz function
1383 Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
1384
1385 // Setup loop basic block
1386 BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc);
1387
1388 // compute first set bit
1389 Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
1390
1391 Value* pIsUndef = ICMP_EQ(pIndex, C(32));
1392
1393 // Split current block
1394 BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
1395
1396 // Remove unconditional jump created by splitBasicBlock
1397 pCurBB->getTerminator()->eraseFromParent();
1398
1399 // Add terminator to end of original block
1400 IRB()->SetInsertPoint(pCurBB);
1401
1402 // Add conditional branch
1403 COND_BR(pIsUndef, pPostLoop, pLoop);
1404
1405 // Add loop basic block contents
1406 IRB()->SetInsertPoint(pLoop);
1407 PHINode* pIndexPhi = PHI(mInt32Ty, 2);
1408 PHINode* pMaskPhi = PHI(mInt32Ty, 2);
1409
1410 pIndexPhi->addIncoming(pIndex, pCurBB);
1411 pMaskPhi->addIncoming(pMask, pCurBB);
1412
1413 // Extract elements for this index
1414 Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
1415 Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
1416
1417 // GEP to this offset in dst
1418 Value* pCurDst = GEP(pDst, pOffsetElem);
1419 pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
1420 STORE(pSrcElem, pCurDst);
1421
1422 // Update the mask
1423 Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
1424
1425 // Terminator
1426 Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
1427
1428 pIsUndef = ICMP_EQ(pNewIndex, C(32));
1429 COND_BR(pIsUndef, pPostLoop, pLoop);
1430
1431 // Update phi edges
1432 pIndexPhi->addIncoming(pNewIndex, pLoop);
1433 pMaskPhi->addIncoming(pNewMask, pLoop);
1434
1435 // Move builder to beginning of post loop
1436 IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
1437 }
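// Per set mask bit i the loop above performs (illustrative pseudocode):
//   *(ElemTy*)((uint8_t*)pDst + vOffsets[i]) = vSrc[i];
// where ElemTy is the element type of vSrc.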
1438
1439 Value* Builder::VABSPS(Value* a)
1440 {
1441 Value* asInt = BITCAST(a, mSimdInt32Ty);
1442 Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1443 return result;
1444 }
1445
1446 Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
1447 {
1448 Value *lowCmp = ICMP_SLT(src, low);
1449 Value *ret = SELECT(lowCmp, low, src);
1450
1451 Value *highCmp = ICMP_SGT(ret, high);
1452 ret = SELECT(highCmp, high, ret);
1453
1454 return ret;
1455 }
1456
1457 Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1458 {
1459 Value *lowCmp = FCMP_OLT(src, low);
1460 Value *ret = SELECT(lowCmp, low, src);
1461
1462 Value *highCmp = FCMP_OGT(ret, high);
1463 ret = SELECT(highCmp, high, ret);
1464
1465 return ret;
1466 }
1467
1468 Value *Builder::FCLAMP(Value* src, float low, float high)
1469 {
1470 Value* result = VMAXPS(src, VIMMED1(low));
1471 result = VMINPS(result, VIMMED1(high));
1472
1473 return result;
1474 }
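// Example: FCLAMP(vValue, 0.0f, 1.0f) saturates each lane to [0, 1], and
// ICLAMP(vValue, C(0), C(255)) does the signed-integer equivalent
// (vValue is a hypothetical SIMD operand).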
1475
1476 //////////////////////////////////////////////////////////////////////////
1477 /// @brief save/restore stack, providing ability to push/pop the stack and
1478 /// reduce overall stack requirements for temporary stack use
1479 Value* Builder::STACKSAVE()
1480 {
1481 Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1482 #if HAVE_LLVM == 0x306
1483 return CALL(pfnStackSave);
1484 #else
1485 return CALLA(pfnStackSave);
1486 #endif
1487 }
1488
1489 void Builder::STACKRESTORE(Value* pSaved)
1490 {
1491 Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1492 CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1493 }
1494
1495 Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1496 {
1497 Value* vOut;
1498 // use FMADs if available
1499 if(JM()->mArch.AVX2())
1500 {
1501 vOut = VFMADDPS(a, b, c);
1502 }
1503 else
1504 {
1505 vOut = FADD(FMUL(a, b), c);
1506 }
1507 return vOut;
1508 }
1509
1510 Value* Builder::POPCNT(Value* a)
1511 {
1512 Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1513 return CALL(pCtPop, std::initializer_list<Value*>{a});
1514 }
1515
1516 //////////////////////////////////////////////////////////////////////////
1517 /// @brief C functions called by LLVM IR
1518 //////////////////////////////////////////////////////////////////////////
1519
1520 //////////////////////////////////////////////////////////////////////////
1521 /// @brief called in JIT code, inserted by PRINT
1522 /// output to both stdout and visual studio debug console
1523 void __cdecl CallPrint(const char* fmt, ...)
1524 {
1525 va_list args;
1526 va_start(args, fmt);
1527 vprintf(fmt, args);
1528
1529 #if defined( _WIN32 )
1530 char strBuf[1024];
1531 vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1532 OutputDebugString(strBuf);
1533 #endif
1534
1535 va_end(args);
1536 }
1537
1538 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1539 {
1540 #if HAVE_LLVM == 0x306
1541 Function *func =
1542 Intrinsic::getDeclaration(JM()->mpCurrentModule,
1543 Intrinsic::x86_avx_vextractf128_si_256);
1544 return CALL(func, {a, imm8});
1545 #else
1546 bool flag = !imm8->isZeroValue();
1547 SmallVector<Constant*,8> idx;
1548 for (unsigned i = 0; i < mVWidth / 2; i++) {
1549 idx.push_back(C(flag ? i + mVWidth / 2 : i));
1550 }
1551 return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1552 #endif
1553 }
1554
1555 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1556 {
1557 #if HAVE_LLVM == 0x306
1558 Function *func =
1559 Intrinsic::getDeclaration(JM()->mpCurrentModule,
1560 Intrinsic::x86_avx_vinsertf128_si_256);
1561 return CALL(func, {a, b, imm8});
1562 #else
1563 bool flag = !imm8->isZeroValue();
1564 SmallVector<Constant*,8> idx;
1565 for (unsigned i = 0; i < mVWidth; i++) {
1566 idx.push_back(C(i));
1567 }
1568 Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1569
1570 SmallVector<Constant*,8> idx2;
1571 for (unsigned i = 0; i < mVWidth / 2; i++) {
1572 idx2.push_back(C(flag ? i : i + mVWidth));
1573 }
1574 for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1575 idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1576 }
1577 return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1578 #endif
1579 }
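// Example (8-wide SIMD): VEXTRACTI128(a, C((uint8_t)1)) yields the upper four
// 32-bit elements of a as a 128-bit half, and VINSERTI128(a, b, C((uint8_t)0))
// places the four elements of b into the lower half of a.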
1580
1581 // rdtsc buckets macros
1582 void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1583 {
1584 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1585 // buckets framework when single threaded
1586 if (KNOB_SINGLE_THREADED)
1587 {
1588 std::vector<Type*> args{
1589 PointerType::get(mInt32Ty, 0), // pBucketMgr
1590 mInt32Ty // id
1591 };
1592
1593 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1594 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1595 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1596 {
1597 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1598 }
1599
1600 CALL(pFunc, { pBucketMgr, pId });
1601 }
1602 }
1603
1604 void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1605 {
1606 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1607 // buckets framework when single threaded
1608 if (KNOB_SINGLE_THREADED)
1609 {
1610 std::vector<Type*> args{
1611 PointerType::get(mInt32Ty, 0), // pBucketMgr
1612 mInt32Ty // id
1613 };
1614
1615 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1616 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1617 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1618 {
1619 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1620 }
1621
1622 CALL(pFunc, { pBucketMgr, pId });
1623 }
1624 }
1625
1626 }