swr: [rasterizer] Slight assert refactoring
src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "common/rdtsc_buckets.h"
32
33 #include <cstdarg>
34
35 namespace SwrJit
36 {
37 void __cdecl CallPrint(const char* fmt, ...);
38
39 //////////////////////////////////////////////////////////////////////////
40 /// @brief Convert an IEEE 754 32-bit single precision float to an
41 /// IEEE 754 16-bit half-precision float with 5 exponent bits and
42 /// 10 mantissa bits.
43 /// @param val - 32-bit float
44 /// @todo Maybe move this outside of this file into a header?
45 static uint16_t Convert32To16Float(float val)
46 {
47 uint32_t sign, exp, mant;
48 uint32_t roundBits;
49
50 // Extract the sign, exponent, and mantissa
51 uint32_t uf = *(uint32_t*)&val;
52 sign = (uf & 0x80000000) >> 31;
53 exp = (uf & 0x7F800000) >> 23;
54 mant = uf & 0x007FFFFF;
55
56 // Check for out of range
57 if (std::isnan(val))
58 {
59 exp = 0x1F;
60 mant = 0x200;
61 sign = 1; // set the sign bit for NANs
62 }
63 else if (std::isinf(val))
64 {
65 exp = 0x1f;
66 mant = 0x0;
67 }
68 else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
69 {
70 exp = 0x1E;
71 mant = 0x3FF;
72 }
73 else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
74 {
75 mant |= 0x00800000;
76 for (; exp <= 0x70; mant >>= 1, exp++)
77 ;
78 exp = 0;
79 mant = mant >> 13;
80 }
81 else if (exp < 0x66) // Too small to represent -> Zero
82 {
83 exp = 0;
84 mant = 0;
85 }
86 else
87 {
88 // Saves bits that will be shifted off for rounding
89 roundBits = mant & 0x1FFFu;
90 // convert exponent and mantissa to 16 bit format
91 exp = exp - 0x70;
92 mant = mant >> 13;
93
94 // Essentially RTZ, but round up if off by only 1 lsb
95 if (roundBits == 0x1FFFu)
96 {
97 mant++;
98 // check for overflow
99 if ((mant & 0xC00u) != 0)
100 exp++;
101 // make sure only the needed bits are used
102 mant &= 0x3FF;
103 }
104 }
105
106 uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
107 return (uint16_t)tmpVal;
108 }
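// Illustrative values for the conversion above (these follow directly from the
// IEEE 754 binary16 encoding; the variable names are hypothetical):
//
//     uint16_t one  = Convert32To16Float(1.0f);    // 0x3C00: sign 0, exp 15, mantissa 0
//     uint16_t half = Convert32To16Float(0.5f);    // 0x3800: sign 0, exp 14, mantissa 0
//     uint16_t big  = Convert32To16Float(1.0e9f);  // clamps to 0x7BFF, the largest finite half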
109
110 //////////////////////////////////////////////////////////////////////////
111 /// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
112 /// float
113 /// @param val - 16-bit float
114 /// @todo Maybe move this outside of this file into a header?
115 static float ConvertSmallFloatTo32(UINT val)
116 {
117 UINT result;
118 if ((val & 0x7fff) == 0)
119 {
120 result = ((uint32_t)(val & 0x8000)) << 16;
121 }
122 else if ((val & 0x7c00) == 0x7c00)
123 {
124 result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
125 result |= ((uint32_t)val & 0x8000) << 16;
126 }
127 else
128 {
129 uint32_t sign = (val & 0x8000) << 16;
130 uint32_t mant = (val & 0x3ff) << 13;
131 uint32_t exp = (val >> 10) & 0x1f;
132 if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
133 {
134 mant <<= 1;
135 while (mant < (0x400 << 13))
136 {
137 exp--;
138 mant <<= 1;
139 }
140 mant &= (0x3ff << 13);
141 }
142 exp = ((exp - 15 + 127) & 0xff) << 23;
143 result = sign | exp | mant;
144 }
145
146 return *(float*)&result;
147 }
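// Round-trip sketch for the two helpers above (a hedged example; the bit
// patterns follow from the previous example):
//
//     float f = ConvertSmallFloatTo32(0x3C00);   // 1.0f
//     float g = ConvertSmallFloatTo32(0xC000);   // -2.0f (sign 1, exponent 16, mantissa 0)
//     // For normal values, Convert32To16Float(ConvertSmallFloatTo32(h)) == h.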
148
149 Constant *Builder::C(bool i)
150 {
151 return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
152 }
153
154 Constant *Builder::C(char i)
155 {
156 return ConstantInt::get(IRB()->getInt8Ty(), i);
157 }
158
159 Constant *Builder::C(uint8_t i)
160 {
161 return ConstantInt::get(IRB()->getInt8Ty(), i);
162 }
163
164 Constant *Builder::C(int i)
165 {
166 return ConstantInt::get(IRB()->getInt32Ty(), i);
167 }
168
169 Constant *Builder::C(int64_t i)
170 {
171 return ConstantInt::get(IRB()->getInt64Ty(), i);
172 }
173
174 Constant *Builder::C(uint16_t i)
175 {
176 return ConstantInt::get(mInt16Ty,i);
177 }
178
179 Constant *Builder::C(uint32_t i)
180 {
181 return ConstantInt::get(IRB()->getInt32Ty(), i);
182 }
183
184 Constant *Builder::C(float i)
185 {
186 return ConstantFP::get(IRB()->getFloatTy(), i);
187 }
188
189 Constant *Builder::PRED(bool pred)
190 {
191 return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
192 }
193
194 Value *Builder::VIMMED1(int i)
195 {
196 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
197 }
198
199 Value *Builder::VIMMED1(uint32_t i)
200 {
201 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
202 }
203
204 Value *Builder::VIMMED1(float i)
205 {
206 return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
207 }
208
209 Value *Builder::VIMMED1(bool i)
210 {
211 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
212 }
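// Typical use of the constant helpers above from inside a Builder method (a
// hypothetical snippet; mVWidth is the SIMD width of the target):
//
//     Value* pLaneId  = C(3);            // scalar i32 constant 3
//     Value* vOne     = VIMMED1(1.0f);   // <mVWidth x float> splat of 1.0f
//     Value* vAllOnes = VIMMED1(-1);     // <mVWidth x i32> splat of 0xFFFFFFFF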
213
214 Value *Builder::VUNDEF_IPTR()
215 {
216 return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
217 }
218
219 Value *Builder::VUNDEF_I()
220 {
221 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
222 }
223
224 Value *Builder::VUNDEF(Type *ty, uint32_t size)
225 {
226 return UndefValue::get(VectorType::get(ty, size));
227 }
228
229 Value *Builder::VUNDEF_F()
230 {
231 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
232 }
233
234 Value *Builder::VUNDEF(Type* t)
235 {
236 return UndefValue::get(VectorType::get(t, mVWidth));
237 }
238
239 #if HAVE_LLVM == 0x306
240 Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
241 {
242 return VINSERT(vec, val, C((int64_t)index));
243 }
244 #endif
245
246 Value *Builder::VBROADCAST(Value *src)
247 {
248 // check if src is already a vector
249 if (src->getType()->isVectorTy())
250 {
251 return src;
252 }
253
254 return VECTOR_SPLAT(mVWidth, src);
255 }
256
257 uint32_t Builder::IMMED(Value* v)
258 {
259 SWR_ASSERT(isa<ConstantInt>(v));
260 ConstantInt *pValConst = cast<ConstantInt>(v);
261 return pValConst->getZExtValue();
262 }
263
264 int32_t Builder::S_IMMED(Value* v)
265 {
266 SWR_ASSERT(isa<ConstantInt>(v));
267 ConstantInt *pValConst = cast<ConstantInt>(v);
268 return pValConst->getSExtValue();
269 }
270
271 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
272 {
273 std::vector<Value*> indices;
274 for (auto i : indexList)
275 indices.push_back(i);
276 return GEPA(ptr, indices);
277 }
278
279 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
280 {
281 std::vector<Value*> indices;
282 for (auto i : indexList)
283 indices.push_back(C(i));
284 return GEPA(ptr, indices);
285 }
286
287 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
288 {
289 std::vector<Value*> valIndices;
290 for (auto i : indices)
291 valIndices.push_back(C(i));
292 return LOAD(GEPA(basePtr, valIndices), name);
293 }
294
295 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
296 {
297 std::vector<Value*> valIndices;
298 for (auto i : indices)
299 valIndices.push_back(i);
300 return LOAD(GEPA(basePtr, valIndices), name);
301 }
302
303 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
304 {
305 std::vector<Value*> valIndices;
306 for (auto i : indices)
307 valIndices.push_back(C(i));
308 return STORE(val, GEPA(basePtr, valIndices));
309 }
310
311 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
312 {
313 std::vector<Value*> valIndices;
314 for (auto i : indices)
315 valIndices.push_back(i);
316 return STORE(val, GEPA(basePtr, valIndices));
317 }
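// The initializer-list overloads above are shorthand for building a GEP and then
// loading or storing through it. A minimal sketch, assuming pState points at a
// struct whose member 1 is itself an aggregate:
//
//     Value* pField = GEP(pState, {0, 1, 2});    // address of pState->member1.member2
//     Value* v      = LOAD(pState, {0, 1, 2});   // load the same field
//     STORE(v, pState, {0, 1, 2});               // write it back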
318
319 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
320 {
321 std::vector<Value*> args;
322 for (auto arg : argsList)
323 args.push_back(arg);
324 return CALLA(Callee, args);
325 }
326
327 #if HAVE_LLVM > 0x306
328 CallInst *Builder::CALL(Value *Callee, Value* arg)
329 {
330 std::vector<Value*> args;
331 args.push_back(arg);
332 return CALLA(Callee, args);
333 }
334
335 CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
336 {
337 std::vector<Value*> args;
338 args.push_back(arg1);
339 args.push_back(arg2);
340 return CALLA(Callee, args);
341 }
342
343 CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
344 {
345 std::vector<Value*> args;
346 args.push_back(arg1);
347 args.push_back(arg2);
348 args.push_back(arg3);
349 return CALLA(Callee, args);
350 }
351 #endif
352
353 //////////////////////////////////////////////////////////////////////////
354 Value *Builder::DEBUGTRAP()
355 {
356 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
357 return CALL(func);
358 }
359
360 Value *Builder::VRCP(Value *va)
361 {
362 return FDIV(VIMMED1(1.0f), va); // 1 / a
363 }
364
365 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
366 {
367 Value* vOut = FMADDPS(vA, vX, vC);
368 vOut = FMADDPS(vB, vY, vOut);
369 return vOut;
370 }
371
372 //////////////////////////////////////////////////////////////////////////
373 /// @brief Generate an i32 masked load operation in LLVM IR. If not
374 /// supported on the underlying platform, emulate it with float masked load
375 /// @param src - base address pointer for the load
376 /// @param vMask - SIMD wide mask that controls whether to access memory or return 0 for the lane
377 Value *Builder::MASKLOADD(Value* src,Value* mask)
378 {
379 Value* vResult;
380 // use avx2 maskload instruction if available
381 if(JM()->mArch.AVX2())
382 {
383 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
384 vResult = CALL(func,{src,mask});
385 }
386 else
387 {
388 // maskload intrinsic expects integer mask operand in llvm >= 3.8
389 #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
390 mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
391 #else
392 mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
393 #endif
394 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
395 vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
396 }
397 return vResult;
398 }
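// Example call (a sketch; pAddr and vMask are assumed to exist in the jitted
// function being built, with the per-lane sign bit set for active lanes):
//
//     Value* vData = MASKLOADD(pAddr, vMask);   // inactive lanes read back as 0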
399
400 //////////////////////////////////////////////////////////////////////////
401 /// @brief insert a JIT call to CallPrint
402 /// - outputs formatted string to both stdout and VS output window
403 /// - DEBUG builds only
404 /// Usage example:
405 /// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
406 /// where C(lane) creates a constant value to print, and pIndex is the Value*
407 /// result from a GEP, printing out the pointer to memory
408 /// @param printStr - constant string to print, which includes format specifiers
409 /// @param printArgs - initializer list of Value*'s to print to std out
410 CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
411 {
412 // push the arguments to CallPrint into a vector
413 std::vector<Value*> printCallArgs;
414 // save room for the format string. we still need to modify it for vectors
415 printCallArgs.resize(1);
416
417 // search through the format string for special processing
418 size_t pos = 0;
419 std::string tempStr(printStr);
420 pos = tempStr.find('%', pos);
421 auto v = printArgs.begin();
422
423 while ((pos != std::string::npos) && (v != printArgs.end()))
424 {
425 Value* pArg = *v;
426 Type* pType = pArg->getType();
427
428 if (pType->isVectorTy())
429 {
430 Type* pContainedType = pType->getContainedType(0);
431
432 if (toupper(tempStr[pos + 1]) == 'X')
433 {
434 tempStr[pos] = '0';
435 tempStr[pos + 1] = 'x';
436 tempStr.insert(pos + 2, "%08X ");
437 pos += 7;
438
439 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
440
441 std::string vectorFormatStr;
442 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
443 {
444 vectorFormatStr += "0x%08X ";
445 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
446 }
447
448 tempStr.insert(pos, vectorFormatStr);
449 pos += vectorFormatStr.size();
450 }
451 else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
452 {
453 uint32_t i = 0;
454 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
455 {
456 tempStr.insert(pos, std::string("%f "));
457 pos += 3;
458 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
459 }
460 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
461 }
462 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
463 {
464 uint32_t i = 0;
465 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
466 {
467 tempStr.insert(pos, std::string("%d "));
468 pos += 3;
469 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
470 }
471 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
472 }
473 }
474 else
475 {
476 if (toupper(tempStr[pos + 1]) == 'X')
477 {
478 tempStr[pos] = '0';
479 tempStr.insert(pos + 1, "x%08");
480 printCallArgs.push_back(pArg);
481 pos += 3;
482 }
483 // for %f we need to cast float Values to doubles so that they print out correctly
484 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
485 {
486 printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
487 pos++;
488 }
489 else
490 {
491 printCallArgs.push_back(pArg);
492 }
493 }
494
495 // advance to the next argument
496 v++;
497 pos = tempStr.find('%', ++pos);
498 }
499
500 // create global variable constant string
501 Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
502 GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
503 JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
504
505 // get a pointer to the first character in the constant string array
506 std::vector<Constant*> geplist{C(0),C(0)};
507 #if HAVE_LLVM == 0x306
508 Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
509 #else
510 Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
511 #endif
512
513 // insert the pointer to the format string in the argument vector
514 printCallArgs[0] = strGEP;
515
516 // get pointer to CallPrint function and insert decl into the module if needed
517 std::vector<Type*> args;
518 args.push_back(PointerType::get(mInt8Ty,0));
519 FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
520 Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
521
522 // if we haven't yet added the symbol to the symbol table
523 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
524 {
525 sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
526 }
527
528 // insert a call to CallPrint
529 return CALLA(callPrintFn,printCallArgs);
530 }
531
532 //////////////////////////////////////////////////////////////////////////
533 /// @brief Wrapper around PRINT with initializer list.
534 CallInst* Builder::PRINT(const std::string &printStr)
535 {
536 return PRINT(printStr, {});
537 }
538
539 //////////////////////////////////////////////////////////////////////////
540 /// @brief Generate a masked gather operation in LLVM IR. If not
541 /// supported on the underlying platform, emulate it with loads
542 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
543 /// @param pBase - Int8* base VB address pointer value
544 /// @param vIndices - SIMD wide value of VB byte offsets
545 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
546 /// @param scale - value to scale indices by
547 Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
548 {
549 Value* vGather;
550
551 // use avx2 gather instruction if available
552 if(JM()->mArch.AVX2())
553 {
554 // force mask to <N x float>, required by vgather
555 vMask = BITCAST(vMask, mSimdFP32Ty);
556 vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
557 }
558 else
559 {
560 Value* pStack = STACKSAVE();
561
562 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
563 Value* vSrcPtr = ALLOCA(vSrc->getType());
564 STORE(vSrc, vSrcPtr);
565
566 vGather = VUNDEF_F();
567 Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
568 Value *vOffsets = MUL(vIndices,vScaleVec);
569 Value *mask = MASK(vMask);
570 for(uint32_t i = 0; i < mVWidth; ++i)
571 {
572 // single component byte index
573 Value *offset = VEXTRACT(vOffsets,C(i));
574 // byte pointer to component
575 Value *loadAddress = GEP(pBase,offset);
576 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
577 // pointer to the value to load if we're masking off a component
578 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
579 Value *selMask = VEXTRACT(mask,C(i));
580 // switch in a safe address to load from if this lane is masked off
581 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
582 Value *val = LOAD(validAddress);
583 vGather = VINSERT(vGather,val,C(i));
584 }
585 STACKRESTORE(pStack);
586 }
587
588 return vGather;
589 }
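// Example call (a sketch with hypothetical value names). Offsets are in bytes,
// so the usual pattern is precomputed byte offsets with a scale of 1:
//
//     Value* vDefaults = VIMMED1(0.0f);
//     Value* vResult   = GATHERPS(vDefaults, pBufferBase, vByteOffsets, vMask, C((char)1));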
590
591 //////////////////////////////////////////////////////////////////////////
592 /// @brief Generate a masked gather operation in LLVM IR. If not
593 /// supported on the underlying platform, emulate it with loads
594 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
595 /// @param pBase - Int8* base VB address pointer value
596 /// @param vIndices - SIMD wide value of VB byte offsets
597 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
598 /// @param scale - value to scale indices by
599 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
600 {
601 Value* vGather;
602
603 // use avx2 gather instruction if available
604 if(JM()->mArch.AVX2())
605 {
606 vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
607 }
608 else
609 {
610 Value* pStack = STACKSAVE();
611
612 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
613 Value* vSrcPtr = ALLOCA(vSrc->getType());
614 STORE(vSrc, vSrcPtr);
615
616 vGather = VUNDEF_I();
617 Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
618 Value *vOffsets = MUL(vIndices, vScaleVec);
619 Value *mask = MASK(vMask);
620 for(uint32_t i = 0; i < mVWidth; ++i)
621 {
622 // single component byte index
623 Value *offset = VEXTRACT(vOffsets, C(i));
624 // byte pointer to component
625 Value *loadAddress = GEP(pBase, offset);
626 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
627 // pointer to the value to load if we're masking off a component
628 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
629 Value *selMask = VEXTRACT(mask, C(i));
630 // switch in a safe address to load from if this lane is masked off
631 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
632 Value *val = LOAD(validAddress, C(0));
633 vGather = VINSERT(vGather, val, C(i));
634 }
635
636 STACKRESTORE(pStack);
637 }
638 return vGather;
639 }
640
641 //////////////////////////////////////////////////////////////////////////
642 /// @brief Generate a masked gather operation in LLVM IR. If not
643 /// supported on the underlying platform, emulate it with loads
644 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
645 /// @param pBase - Int8* base VB address pointer value
646 /// @param vIndices - SIMD wide value of VB byte offsets
647 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
648 /// @param scale - value to scale indices by
649 Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
650 {
651 Value* vGather;
652
653 // use avx2 gather instruction if available
654 if(JM()->mArch.AVX2())
655 {
656 vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, scale);
657 }
658 else
659 {
660 Value* pStack = STACKSAVE();
661
662 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
663 Value* vSrcPtr = ALLOCA(vSrc->getType());
664 STORE(vSrc, vSrcPtr);
665
666 vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
667 Value *vScaleVec = VECTOR_SPLAT(4, Z_EXT(scale,mInt32Ty));
668 Value *vOffsets = MUL(vIndices,vScaleVec);
669 Value *mask = MASK(vMask);
670 for(uint32_t i = 0; i < mVWidth/2; ++i)
671 {
672 // single component byte index
673 Value *offset = VEXTRACT(vOffsets,C(i));
674 // byte pointer to component
675 Value *loadAddress = GEP(pBase,offset);
676 loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
677 // pointer to the value to load if we're masking off a component
678 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
679 Value *selMask = VEXTRACT(mask,C(i));
680 // switch in a safe address to load from if this lane is masked off
681 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
682 Value *val = LOAD(validAddress);
683 vGather = VINSERT(vGather,val,C(i));
684 }
685 STACKRESTORE(pStack);
686 }
687 return vGather;
688 }
689
690 //////////////////////////////////////////////////////////////////////////
691 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
692 Value* Builder::MASK(Value* vmask)
693 {
694 Value* src = BITCAST(vmask, mSimdInt32Ty);
695 return ICMP_SLT(src, VIMMED1(0));
696 }
697
698 //////////////////////////////////////////////////////////////////////////
699 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
700 Value* Builder::VMASK(Value* mask)
701 {
702 return S_EXT(mask, mSimdInt32Ty);
703 }
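// MASK and VMASK convert between the x86 convention (sign bit of each 32-bit
// lane marks an active lane) and LLVM's <N x i1> mask form. A minimal round-trip
// sketch:
//
//     Value* vCmp   = ICMP_SGT(vA, vB);   // <N x i1>
//     Value* vXmask = VMASK(vCmp);        // <N x i32>, 0xFFFFFFFF or 0 per lane
//     Value* vBack  = MASK(vXmask);       // back to <N x i1>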
704
705 //////////////////////////////////////////////////////////////////////////
706 /// @brief Generate a VPSHUFB operation in LLVM IR. If not
707 /// supported on the underlying platform, emulate it
708 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
709 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
710 /// Byte masks in the lower 128-bit lane of b select 8-bit values from the
711 /// lower 128 bits of a, and likewise for the upper lanes. If a mask
712 /// value is negative, '0' is inserted.
713 Value *Builder::PSHUFB(Value* a, Value* b)
714 {
715 Value* res;
716 // use avx2 pshufb instruction if available
717 if(JM()->mArch.AVX2())
718 {
719 res = VPSHUFB(a, b);
720 }
721 else
722 {
723 Constant* cB = dyn_cast<Constant>(b);
724 // number of 8 bit elements in b
725 uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
726 // output vector
727 Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
728
729 // insert an 8 bit value from the high and low lanes of a per loop iteration
730 numElms /= 2;
731 for(uint32_t i = 0; i < numElms; i++)
732 {
733 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
734 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
735
736 // extract values from constant mask
737 char valLow128bLane = (char)(cLow128b->getSExtValue());
738 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
739
740 Value* insertValLow128b;
741 Value* insertValHigh128b;
742
743 // if the mask value is negative, insert a '0' in the respective output position
744 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
745 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
746 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
747
748 vShuf = VINSERT(vShuf, insertValLow128b, i);
749 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
750 }
751 res = vShuf;
752 }
753 return res;
754 }
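// A small illustration of the per-byte semantics (hypothetical mask values; the
// emulation path above requires b to be a constant vector):
//
//     Value* b = C<char>({1, 0, -1, 2, /* ... remaining 28 mask bytes ... */});
//     Value* r = PSHUFB(a, b);   // r[0]=a[1], r[1]=a[0], r[2]=0, r[3]=a[2]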
755
756 //////////////////////////////////////////////////////////////////////////
757 /// @brief Generate a VPMOVSXBD operation (sign extend 8 8-bit values to 32
758 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
759 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
760 /// lower 8 values are used.
761 Value *Builder::PMOVSXBD(Value* a)
762 {
763 // llvm-3.9 removed the pmovsxbd intrinsic
764 #if HAVE_LLVM < 0x309
765 // use avx2 byte sign extend instruction if available
766 if(JM()->mArch.AVX2())
767 {
768 Function *pmovsxbd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd);
769 return CALL(pmovsxbd, std::initializer_list<Value*>{a});
770 }
771 else
772 #endif
773 {
774 // VPMOVSXBD output type
775 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
776 // Extract 8 values from 128bit lane and sign extend
777 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
778 }
779 }
780
781 //////////////////////////////////////////////////////////////////////////
782 /// @brief Generate a VPMOVSXWD operation (sign extend 8 16-bit values to 32
783 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
784 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
785 Value *Builder::PMOVSXWD(Value* a)
786 {
787 // llvm-3.9 removed the pmovsxwd intrinsic
788 #if HAVE_LLVM < 0x309
789 // use avx2 word sign extend if available
790 if(JM()->mArch.AVX2())
791 {
792 Function *pmovsxwd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd);
793 return CALL(pmovsxwd, std::initializer_list<Value*>{a});
794 }
795 else
796 #endif
797 {
798 // VPMOVSXWD output type
799 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
800 // Extract 8 values from 128bit lane and sign extend
801 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
802 }
803 }
804
805 //////////////////////////////////////////////////////////////////////////
806 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
807 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
808 /// platform, emulate it
809 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
810 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
811 Value *Builder::PERMD(Value* a, Value* idx)
812 {
813 Value* res;
814 // use avx2 permute instruction if available
815 if(JM()->mArch.AVX2())
816 {
817 res = VPERMD(a, idx);
818 }
819 else
820 {
821 if (isa<Constant>(idx))
822 {
823 res = VSHUFFLE(a, a, idx);
824 }
825 else
826 {
827 res = VUNDEF_I();
828 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
829 {
830 Value* pIndex = VEXTRACT(idx, C(l));
831 Value* pVal = VEXTRACT(a, pIndex);
832 res = VINSERT(res, pVal, C(l));
833 }
834 }
835 }
836 return res;
837 }
838
839 //////////////////////////////////////////////////////////////////////////
840 /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
841 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
842 /// platform, emulate it
843 /// @param a - 256bit SIMD lane(8x32bit) of float values.
844 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
845 Value *Builder::PERMPS(Value* a, Value* idx)
846 {
847 Value* res;
848 // use avx2 permute instruction if available
849 if (JM()->mArch.AVX2())
850 {
851 // llvm 3.6.0 swapped the order of the args to vpermd
852 res = VPERMPS(idx, a);
853 }
854 else
855 {
856 if (isa<Constant>(idx))
857 {
858 res = VSHUFFLE(a, a, idx);
859 }
860 else
861 {
862 res = VUNDEF_F();
863 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
864 {
865 Value* pIndex = VEXTRACT(idx, C(l));
866 Value* pVal = VEXTRACT(a, pIndex);
867 res = VINSERT(res, pVal, C(l));
868 }
869 }
870 }
871
872 return res;
873 }
874
875 //////////////////////////////////////////////////////////////////////////
876 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
877 /// in LLVM IR. If not supported on the underlying platform, emulate it
878 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
879 Value *Builder::CVTPH2PS(Value* a)
880 {
881 if (JM()->mArch.F16C())
882 {
883 return VCVTPH2PS(a);
884 }
885 else
886 {
887 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
888 Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));
889
890 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
891 {
892 sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
893 }
894
895 Value* pResult = UndefValue::get(mSimdFP32Ty);
896 for (uint32_t i = 0; i < mVWidth; ++i)
897 {
898 Value* pSrc = VEXTRACT(a, C(i));
899 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
900 pResult = VINSERT(pResult, pConv, C(i));
901 }
902
903 return pResult;
904 }
905 }
906
907 //////////////////////////////////////////////////////////////////////////
908 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
909 /// in LLVM IR. If not supported on the underlying platform, emulate it
910 /// @param a - SIMD lane of float32 values to convert to float16.
911 Value *Builder::CVTPS2PH(Value* a, Value* rounding)
912 {
913 if (JM()->mArch.F16C())
914 {
915 return VCVTPS2PH(a, rounding);
916 }
917 else
918 {
919 // call scalar C function for now
920 FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
921 Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));
922
923 if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
924 {
925 sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
926 }
927
928 Value* pResult = UndefValue::get(mSimdInt16Ty);
929 for (uint32_t i = 0; i < mVWidth; ++i)
930 {
931 Value* pSrc = VEXTRACT(a, C(i));
932 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
933 pResult = VINSERT(pResult, pConv, C(i));
934 }
935
936 return pResult;
937 }
938 }
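// Round-trip sketch for the two conversions above (hypothetical names; the
// rounding immediate is only consumed by the F16C hardware path, 0 = round to
// nearest even):
//
//     Value* vHalf = CVTPS2PH(vFloats, C(0));
//     Value* vF32  = CVTPH2PS(vHalf);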
939
940 Value *Builder::PMAXSD(Value* a, Value* b)
941 {
942 // llvm-3.9 removed the pmax intrinsics
943 #if HAVE_LLVM >= 0x309
944 Value* cmp = ICMP_SGT(a, b);
945 return SELECT(cmp, a, b);
946 #else
947 if (JM()->mArch.AVX2())
948 {
949 Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmaxs_d);
950 return CALL(pmaxsd, {a, b});
951 }
952 else
953 {
954 // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
955 Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);
956
957 // low 128
958 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
959 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
960 Value* resLo = CALL(pmaxsd, {aLo, bLo});
961
962 // high 128
963 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
964 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
965 Value* resHi = CALL(pmaxsd, {aHi, bHi});
966
967 // combine
968 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
969 result = VINSERTI128(result, resHi, C((uint8_t)1));
970
971 return result;
972 }
973 #endif
974 }
975
976 Value *Builder::PMINSD(Value* a, Value* b)
977 {
978 // llvm-3.9 removed the pmin intrinsics
979 #if HAVE_LLVM >= 0x309
980 Value* cmp = ICMP_SLT(a, b);
981 return SELECT(cmp, a, b);
982 #else
983 if (JM()->mArch.AVX2())
984 {
985 Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmins_d);
986 return CALL(pminsd, {a, b});
987 }
988 else
989 {
990 // use 4-wide sse min intrinsic on lower/upper halves of 8-wide sources
991 Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);
992
993 // low 128
994 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
995 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
996 Value* resLo = CALL(pminsd, {aLo, bLo});
997
998 // high 128
999 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
1000 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
1001 Value* resHi = CALL(pminsd, {aHi, bHi});
1002
1003 // combine
1004 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
1005 result = VINSERTI128(result, resHi, C((uint8_t)1));
1006
1007 return result;
1008 }
1009 #endif
1010 }
1011
1012 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
1013 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1014 {
1015 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
1016 if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
1017 {
1018 // ensure our mask is the correct type
1019 mask = BITCAST(mask, mSimdFP32Ty);
1020 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1021 }
1022 else
1023 {
1024 // ensure our mask is the correct type
1025 mask = BITCAST(mask, mSimdInt32Ty);
1026 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1027 }
1028 }
1029
1030 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1031 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1032 {
1033 switch(info.bpp / info.numComps)
1034 {
1035 case 16:
1036 {
1037 Value* vGatherResult[2];
1038 Value *vMask;
1039
1040 // TODO: vGatherMaskedVal
1041 Value* vGatherMaskedVal = VIMMED1((float)0);
1042
1043 // always have at least one component out of x or y to fetch
1044
1045 // save mask as it is zero'd out after each gather
1046 vMask = mask;
1047
1048 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1049 // e.g. result of first 8x32bit integer gather for 16bit components
1050 // 256i - 0 1 2 3 4 5 6 7
1051 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1052 //
1053
1054 // if we have at least one component out of z or w to fetch
1055 if(info.numComps > 2)
1056 {
1057 // offset base to the next components(zw) in the vertex to gather
1058 pSrcBase = GEP(pSrcBase, C((char)4));
1059 vMask = mask;
1060
1061 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1062 // e.g. result of second 8x32bit integer gather for 16bit components
1063 // 256i - 0 1 2 3 4 5 6 7
1064 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1065 //
1066 }
1067 else
1068 {
1069 vGatherResult[1] = vGatherMaskedVal;
1070 }
1071
1072 // Shuffle gathered components into place, each row is a component
1073 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1074 }
1075 break;
1076 case 32:
1077 {
1078 // apply defaults
1079 for (uint32_t i = 0; i < 4; ++i)
1080 {
1081 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1082 }
1083
1084 for(uint32_t i = 0; i < info.numComps; i++)
1085 {
1086 uint32_t swizzleIndex = info.swizzle[i];
1087
1088 // save mask as it is zero'd out after each gather
1089 Value *vMask = mask;
1090
1091 // Gather a SIMD of components
1092 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1093
1094 // offset base to the next component to gather
1095 pSrcBase = GEP(pSrcBase, C((char)4));
1096 }
1097 }
1098 break;
1099 default:
1100 SWR_INVALID("Invalid float format");
1101 break;
1102 }
1103 }
1104
1105 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1106 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1107 {
1108 switch (info.bpp / info.numComps)
1109 {
1110 case 8:
1111 {
1112 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1113 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
1114 // e.g. result of an 8x32bit integer gather for 8bit components
1115 // 256i - 0 1 2 3 4 5 6 7
1116 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1117
1118 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1119 }
1120 break;
1121 case 16:
1122 {
1123 Value* vGatherResult[2];
1124 Value *vMask;
1125
1126 // TODO: vGatherMaskedVal
1127 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1128
1129 // always have at least one component out of x or y to fetch
1130
1131 // save mask as it is zero'd out after each gather
1132 vMask = mask;
1133
1134 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1135 // e.g. result of first 8x32bit integer gather for 16bit components
1136 // 256i - 0 1 2 3 4 5 6 7
1137 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1138 //
1139
1140 // if we have at least one component out of z or w to fetch
1141 if(info.numComps > 2)
1142 {
1143 // offset base to the next components(zw) in the vertex to gather
1144 pSrcBase = GEP(pSrcBase, C((char)4));
1145 vMask = mask;
1146
1147 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1148 // e.g. result of second 8x32bit integer gather for 16bit components
1149 // 256i - 0 1 2 3 4 5 6 7
1150 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1151 //
1152 }
1153 else
1154 {
1155 vGatherResult[1] = vGatherMaskedVal;
1156 }
1157
1158 // Shuffle gathered components into place, each row is a component
1159 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1160
1161 }
1162 break;
1163 case 32:
1164 {
1165 // apply defaults
1166 for (uint32_t i = 0; i < 4; ++i)
1167 {
1168 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1169 }
1170
1171 for(uint32_t i = 0; i < info.numComps; i++)
1172 {
1173 uint32_t swizzleIndex = info.swizzle[i];
1174
1175 // save mask as it is zero'd out after each gather
1176 Value *vMask = mask;
1177
1178 // Gather a SIMD of components
1179 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1180
1181 // offset base to the next component to gather
1182 pSrcBase = GEP(pSrcBase, C((char)4));
1183 }
1184 }
1185 break;
1186 default:
1187 SWR_INVALID("unsupported format");
1188 break;
1189 }
1190 }
1191
1192 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1193 {
1194 // cast types
1195 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1196 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1197
1198 // input could either be float or int vector; do shuffle work in int
1199 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1200 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1201
1202 if(bPackedOutput)
1203 {
1204 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1205
1206 // shuffle mask
1207 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1208 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1209 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1210 // after pshufb: group components together in each 128bit lane
1211 // 256i - 0 1 2 3 4 5 6 7
1212 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1213
1214 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1215 // after PERMD: move and pack xy components into each 128bit lane
1216 // 256i - 0 1 2 3 4 5 6 7
1217 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1218
1219 // do the same for zw components
1220 Value* vi128ZW = nullptr;
1221 if(info.numComps > 2)
1222 {
1223 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1224 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1225 }
1226
1227 for(uint32_t i = 0; i < 4; i++)
1228 {
1229 uint32_t swizzleIndex = info.swizzle[i];
1230 // todo: fix for packed
1231 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1232 if(i >= info.numComps)
1233 {
1234 // set the default component val
1235 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1236 continue;
1237 }
1238
1239 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1240 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1241 // if x or y, use vi128XY permute result, else use vi128ZW
1242 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1243
1244 // extract packed component 128 bit lanes
1245 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1246 }
1247
1248 }
1249 else
1250 {
1251 // pshufb masks for each component
1252 Value* vConstMask[2];
1253 // x/z shuffle mask
1254 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1255 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1256
1257 // y/w shuffle mask
1258 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1259 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1260
1261
1262 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1263 // apply defaults
1264 for (uint32_t i = 0; i < 4; ++i)
1265 {
1266 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1267 }
1268
1269 for(uint32_t i = 0; i < info.numComps; i++)
1270 {
1271 uint32_t swizzleIndex = info.swizzle[i];
1272
1273 // select correct constMask for x/z or y/w pshufb
1274 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1275 // if x or y, use vi128XY permute result, else use vi128ZW
1276 uint32_t selectedGather = (i < 2) ? 0 : 1;
1277
1278 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1279 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1280 // 256i - 0 1 2 3 4 5 6 7
1281 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1282 }
1283 }
1284 }
1285
1286 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1287 {
1288 // cast types
1289 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1290 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1291
1292 if(bPackedOutput)
1293 {
1294 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1295 // shuffle mask
1296 Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1297 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1298 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1299 // after pshufb: group components together in each 128bit lane
1300 // 256i - 0 1 2 3 4 5 6 7
1301 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1302
1303 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1304 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1305 // 256i - 0 1 2 3 4 5 6 7
1306 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1307
1308 // do the same for zw components
1309 Value* vi128ZW = nullptr;
1310 if(info.numComps > 2)
1311 {
1312 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1313 }
1314
1315 // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
1316 for(uint32_t i = 0; i < 4; i++)
1317 {
1318 uint32_t swizzleIndex = info.swizzle[i];
1319 // todo: fix for packed
1320 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1321 if(i >= info.numComps)
1322 {
1323 // set the default component val
1324 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1325 continue;
1326 }
1327
1328 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1329 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1330 // if x or y, use vi128XY permute result, else use vi128ZW
1331 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1332
1333 // sign extend
1334 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1335 }
1336 }
1337 // else zero extend
1338 else{
1339 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1340 // apply defaults
1341 for (uint32_t i = 0; i < 4; ++i)
1342 {
1343 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1344 }
1345
1346 for(uint32_t i = 0; i < info.numComps; i++){
1347 uint32_t swizzleIndex = info.swizzle[i];
1348
1349 // pshufb masks for each component
1350 Value* vConstMask;
1351 switch(i)
1352 {
1353 case 0:
1354 // x shuffle mask
1355 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1356 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1357 break;
1358 case 1:
1359 // y shuffle mask
1360 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1361 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1362 break;
1363 case 2:
1364 // z shuffle mask
1365 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1366 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1367 break;
1368 case 3:
1369 // w shuffle mask
1370 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1371 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1372 break;
1373 default:
1374 vConstMask = nullptr;
1375 break;
1376 }
1377
1378 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1379 // after pshufb for x channel
1380 // 256i - 0 1 2 3 4 5 6 7
1381 // x000 x000 x000 x000 x000 x000 x000 x000
1382 }
1383 }
1384 }
1385
1386 // Helper function to create alloca in entry block of function
1387 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
1388 {
1389 auto saveIP = IRB()->saveIP();
1390 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1391 pFunc->getEntryBlock().begin());
1392 Value* pAlloca = ALLOCA(pType);
1393 IRB()->restoreIP(saveIP);
1394 return pAlloca;
1395 }
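// Allocas created in the entry block can be promoted to registers by mem2reg and
// are not re-created on every loop iteration; SCATTERPS below relies on this to
// reuse a single stack slot across all scatters in a shader. A hedged sketch:
//
//     Value* pTmp = CreateEntryAlloca(pFunc, mSimdInt32Ty);   // hoisted to function entry
//     STORE(vData, pTmp);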
1396
1397 //////////////////////////////////////////////////////////////////////////
1398 /// @brief emulates a scatter operation.
1399 /// @param pDst - pointer to destination
1400 /// @param vSrc - vector of src data to scatter
1401 /// @param vOffsets - vector of byte offsets from pDst
1402 /// @param vMask - mask of valid lanes
1403 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1404 {
1405 /* Scatter algorithm
1406
1407 while(Index = BitScanForward(mask))
1408 srcElem = srcVector[Index]
1409 offsetElem = offsetVector[Index]
1410 *(pDst + offsetElem) = srcElem
1411 Update mask (mask &= ~(1 << Index))
1412
1413 */
1414
1415 BasicBlock* pCurBB = IRB()->GetInsertBlock();
1416 Function* pFunc = pCurBB->getParent();
1417 Type* pSrcTy = vSrc->getType()->getVectorElementType();
1418
1419 // Store vectors on stack
1420 if (pScatterStackSrc == nullptr)
1421 {
1422 // Save off stack allocations and reuse per scatter. Significantly reduces stack
1423 // requirements for shaders with a lot of scatters.
1424 pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
1425 pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
1426 }
1427
1428 Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
1429 Value* pOffsetsArrayPtr = pScatterStackOffsets;
1430 STORE(vSrc, pSrcArrayPtr);
1431 STORE(vOffsets, pOffsetsArrayPtr);
1432
1433 // Cast to pointers for random access
1434 pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
1435 pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
1436
1437 Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
1438
1439 // Get cttz function
1440 Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
1441
1442 // Setup loop basic block
1443 BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc);
1444
1445 // compute first set bit
1446 Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
1447
1448 Value* pIsUndef = ICMP_EQ(pIndex, C(32));
1449
1450 // Split current block
1451 BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
1452
1453 // Remove unconditional jump created by splitBasicBlock
1454 pCurBB->getTerminator()->eraseFromParent();
1455
1456 // Add terminator to end of original block
1457 IRB()->SetInsertPoint(pCurBB);
1458
1459 // Add conditional branch
1460 COND_BR(pIsUndef, pPostLoop, pLoop);
1461
1462 // Add loop basic block contents
1463 IRB()->SetInsertPoint(pLoop);
1464 PHINode* pIndexPhi = PHI(mInt32Ty, 2);
1465 PHINode* pMaskPhi = PHI(mInt32Ty, 2);
1466
1467 pIndexPhi->addIncoming(pIndex, pCurBB);
1468 pMaskPhi->addIncoming(pMask, pCurBB);
1469
1470 // Extract elements for this index
1471 Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
1472 Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
1473
1474 // GEP to this offset in dst
1475 Value* pCurDst = GEP(pDst, pOffsetElem);
1476 pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
1477 STORE(pSrcElem, pCurDst);
1478
1479 // Update the mask
1480 Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
1481
1482 // Terminator
1483 Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
1484
1485 pIsUndef = ICMP_EQ(pNewIndex, C(32));
1486 COND_BR(pIsUndef, pPostLoop, pLoop);
1487
1488 // Update phi edges
1489 pIndexPhi->addIncoming(pNewIndex, pLoop);
1490 pMaskPhi->addIncoming(pNewMask, pLoop);
1491
1492 // Move builder to beginning of post loop
1493 IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
1494 }
1495
1496 Value* Builder::VABSPS(Value* a)
1497 {
1498 Value* asInt = BITCAST(a, mSimdInt32Ty);
1499 Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1500 return result;
1501 }
1502
1503 Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
1504 {
1505 Value *lowCmp = ICMP_SLT(src, low);
1506 Value *ret = SELECT(lowCmp, low, src);
1507
1508 Value *highCmp = ICMP_SGT(ret, high);
1509 ret = SELECT(highCmp, high, ret);
1510
1511 return ret;
1512 }
1513
1514 Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1515 {
1516 Value *lowCmp = FCMP_OLT(src, low);
1517 Value *ret = SELECT(lowCmp, low, src);
1518
1519 Value *highCmp = FCMP_OGT(ret, high);
1520 ret = SELECT(highCmp, high, ret);
1521
1522 return ret;
1523 }
1524
1525 Value *Builder::FCLAMP(Value* src, float low, float high)
1526 {
1527 Value* result = VMAXPS(src, VIMMED1(low));
1528 result = VMINPS(result, VIMMED1(high));
1529
1530 return result;
1531 }
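// The float overload above maps directly onto vmaxps/vminps; aside from NaN
// handling it behaves like the compare-and-select version. Example (hypothetical
// value name):
//
//     Value* vSat = FCLAMP(vColor, 0.0f, 1.0f);   // per-lane saturate to [0, 1]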
1532
1533 //////////////////////////////////////////////////////////////////////////
1534 /// @brief save/restore stack, providing ability to push/pop the stack and
1535 /// reduce overall stack requirements for temporary stack use
1536 Value* Builder::STACKSAVE()
1537 {
1538 Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1539 #if HAVE_LLVM == 0x306
1540 return CALL(pfnStackSave);
1541 #else
1542 return CALLA(pfnStackSave);
1543 #endif
1544 }
1545
1546 void Builder::STACKRESTORE(Value* pSaved)
1547 {
1548 Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1549 CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1550 }
1551
1552 Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1553 {
1554 Value* vOut;
1555 // use FMADs if available
1556 if(JM()->mArch.AVX2())
1557 {
1558 vOut = VFMADDPS(a, b, c);
1559 }
1560 else
1561 {
1562 vOut = FADD(FMUL(a, b), c);
1563 }
1564 return vOut;
1565 }
1566
1567 Value* Builder::POPCNT(Value* a)
1568 {
1569 Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1570 return CALL(pCtPop, std::initializer_list<Value*>{a});
1571 }
1572
1573 //////////////////////////////////////////////////////////////////////////
1574 /// @brief C functions called by LLVM IR
1575 //////////////////////////////////////////////////////////////////////////
1576
1577 //////////////////////////////////////////////////////////////////////////
1578 /// @brief called in JIT code, inserted by PRINT
1579 /// output to both stdout and visual studio debug console
1580 void __cdecl CallPrint(const char* fmt, ...)
1581 {
1582 va_list args;
1583 va_start(args, fmt);
1584 vprintf(fmt, args);
1585
1586 #if defined( _WIN32 )
1587 char strBuf[1024];
1588 vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1589 OutputDebugString(strBuf);
1590 #endif
1591
1592 va_end(args);
1593 }
1594
1595 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1596 {
1597 #if HAVE_LLVM == 0x306
1598 Function *func =
1599 Intrinsic::getDeclaration(JM()->mpCurrentModule,
1600 Intrinsic::x86_avx_vextractf128_si_256);
1601 return CALL(func, {a, imm8});
1602 #else
1603 bool flag = !imm8->isZeroValue();
1604 SmallVector<Constant*,8> idx;
1605 for (unsigned i = 0; i < mVWidth / 2; i++) {
1606 idx.push_back(C(flag ? i + mVWidth / 2 : i));
1607 }
1608 return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1609 #endif
1610 }
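// For the emulated path with mVWidth == 8 the shuffle indices are {0..3} when
// imm8 is 0 and {4..7} otherwise, i.e. the lower or upper 128-bit half. Usage
// sketch:
//
//     Value* vLo = VEXTRACTI128(vSrc, C((uint8_t)0));
//     Value* vHi = VEXTRACTI128(vSrc, C((uint8_t)1));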
1611
1612 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1613 {
1614 #if HAVE_LLVM == 0x306
1615 Function *func =
1616 Intrinsic::getDeclaration(JM()->mpCurrentModule,
1617 Intrinsic::x86_avx_vinsertf128_si_256);
1618 return CALL(func, {a, b, imm8});
1619 #else
1620 bool flag = !imm8->isZeroValue();
1621 SmallVector<Constant*,8> idx;
1622 for (unsigned i = 0; i < mVWidth; i++) {
1623 idx.push_back(C(i));
1624 }
1625 Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1626
1627 SmallVector<Constant*,8> idx2;
1628 for (unsigned i = 0; i < mVWidth / 2; i++) {
1629 idx2.push_back(C(flag ? i : i + mVWidth));
1630 }
1631 for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1632 idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1633 }
1634 return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1635 #endif
1636 }
1637
1638 // rdtsc buckets macros
1639 void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1640 {
1641 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1642 // buckets framework when single threaded
1643 if (KNOB_SINGLE_THREADED)
1644 {
1645 std::vector<Type*> args{
1646 PointerType::get(mInt32Ty, 0), // pBucketMgr
1647 mInt32Ty // id
1648 };
1649
1650 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1651 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1652 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1653 {
1654 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1655 }
1656
1657 CALL(pFunc, { pBucketMgr, pId });
1658 }
1659 }
1660
1661 void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1662 {
1663 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1664 // buckets framework when single threaded
1665 if (KNOB_SINGLE_THREADED)
1666 {
1667 std::vector<Type*> args{
1668 PointerType::get(mInt32Ty, 0), // pBucketMgr
1669 mInt32Ty // id
1670 };
1671
1672 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1673 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1674 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1675 {
1676 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1677 }
1678
1679 CALL(pFunc, { pBucketMgr, pId });
1680 }
1681 }
1682
1683 }