swr/rast: stop using MSFT types in platform independent code
[mesa.git] / src / gallium / drivers / swr / rasterizer / jitter / builder_misc.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "common/rdtsc_buckets.h"
32
33 #include <cstdarg>
34
35 namespace SwrJit
36 {
37 void __cdecl CallPrint(const char* fmt, ...);
38
39 //////////////////////////////////////////////////////////////////////////
40 /// @brief Convert an IEEE 754 32-bit single precision float to a
41 /// 16-bit float with 5 exponent bits and a variable
42 /// number of mantissa bits.
43 /// @param val - 32-bit float
44 /// @todo Maybe move this outside of this file into a header?
45 static uint16_t Convert32To16Float(float val)
46 {
47 uint32_t sign, exp, mant;
48 uint32_t roundBits;
49
50 // Extract the sign, exponent, and mantissa
51 uint32_t uf = *(uint32_t*)&val;
52 sign = (uf & 0x80000000) >> 31;
53 exp = (uf & 0x7F800000) >> 23;
54 mant = uf & 0x007FFFFF;
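// Note: 0x70 (112) is the difference between the float32 exponent bias (127) and
// the float16 bias (15); biased exponents above 0x70 + 0x1E overflow half precision,
// while exponents at or below 0x70 fall into the half denormal/zero range.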
55
56 // Check for out of range
57 if (std::isnan(val))
58 {
59 exp = 0x1F;
60 mant = 0x200;
61 sign = 1; // set the sign bit for NANs
62 }
63 else if (std::isinf(val))
64 {
65 exp = 0x1f;
66 mant = 0x0;
67 }
68 else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
69 {
70 exp = 0x1E;
71 mant = 0x3FF;
72 }
73 else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
74 {
75 mant |= 0x00800000;
76 for (; exp <= 0x70; mant >>= 1, exp++)
77 ;
78 exp = 0;
79 mant = mant >> 13;
80 }
81 else if (exp < 0x66) // Too small to represent -> Zero
82 {
83 exp = 0;
84 mant = 0;
85 }
86 else
87 {
88 // Saves bits that will be shifted off for rounding
89 roundBits = mant & 0x1FFFu;
90 // convert exponent and mantissa to 16 bit format
91 exp = exp - 0x70;
92 mant = mant >> 13;
93
94 // Essentially RTZ, but round up if off by only 1 lsb
95 if (roundBits == 0x1FFFu)
96 {
97 mant++;
98 // check for overflow
99 if ((mant & 0xC00u) != 0)
100 exp++;
101 // make sure only the needed bits are used
102 mant &= 0x3FF;
103 }
104 }
105
106 uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
107 return (uint16_t)tmpVal;
108 }
109
110 //////////////////////////////////////////////////////////////////////////
111 /// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
112 /// float
113 /// @param val - 16-bit float
114 /// @todo Maybe move this outside of this file into a header?
115 static float ConvertSmallFloatTo32(uint32_t val)
116 {
117 uint32_t result;
118 if ((val & 0x7fff) == 0)
119 {
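// +/- zero: only the sign bit carries over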
120 result = ((uint32_t)(val & 0x8000)) << 16;
121 }
122 else if ((val & 0x7c00) == 0x7c00)
123 {
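// Inf or NaN (all exponent bits set): NaN payloads map to a canonical quiet NaN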
124 result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
125 result |= ((uint32_t)val & 0x8000) << 16;
126 }
127 else
128 {
129 uint32_t sign = (val & 0x8000) << 16;
130 uint32_t mant = (val & 0x3ff) << 13;
131 uint32_t exp = (val >> 10) & 0x1f;
132 if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
133 {
134 mant <<= 1;
135 while (mant < (0x400 << 13))
136 {
137 exp--;
138 mant <<= 1;
139 }
140 mant &= (0x3ff << 13);
141 }
142 exp = ((exp - 15 + 127) & 0xff) << 23;
143 result = sign | exp | mant;
144 }
145
146 return *(float*)&result;
147 }
148
149 Constant *Builder::C(bool i)
150 {
151 return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
152 }
153
154 Constant *Builder::C(char i)
155 {
156 return ConstantInt::get(IRB()->getInt8Ty(), i);
157 }
158
159 Constant *Builder::C(uint8_t i)
160 {
161 return ConstantInt::get(IRB()->getInt8Ty(), i);
162 }
163
164 Constant *Builder::C(int i)
165 {
166 return ConstantInt::get(IRB()->getInt32Ty(), i);
167 }
168
169 Constant *Builder::C(int64_t i)
170 {
171 return ConstantInt::get(IRB()->getInt64Ty(), i);
172 }
173
174 Constant *Builder::C(uint16_t i)
175 {
176 return ConstantInt::get(mInt16Ty,i);
177 }
178
179 Constant *Builder::C(uint32_t i)
180 {
181 return ConstantInt::get(IRB()->getInt32Ty(), i);
182 }
183
184 Constant *Builder::C(float i)
185 {
186 return ConstantFP::get(IRB()->getFloatTy(), i);
187 }
188
189 Constant *Builder::PRED(bool pred)
190 {
191 return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
192 }
193
194 Value *Builder::VIMMED1(int i)
195 {
196 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
197 }
198
199 Value *Builder::VIMMED1(uint32_t i)
200 {
201 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
202 }
203
204 Value *Builder::VIMMED1(float i)
205 {
206 return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
207 }
208
209 Value *Builder::VIMMED1(bool i)
210 {
211 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
212 }
213
214 Value *Builder::VUNDEF_IPTR()
215 {
216 return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
217 }
218
219 Value *Builder::VUNDEF_I()
220 {
221 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
222 }
223
224 Value *Builder::VUNDEF(Type *ty, uint32_t size)
225 {
226 return UndefValue::get(VectorType::get(ty, size));
227 }
228
229 Value *Builder::VUNDEF_F()
230 {
231 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
232 }
233
234 Value *Builder::VUNDEF(Type* t)
235 {
236 return UndefValue::get(VectorType::get(t, mVWidth));
237 }
238
239 Value *Builder::VBROADCAST(Value *src)
240 {
241 // check if src is already a vector
242 if (src->getType()->isVectorTy())
243 {
244 return src;
245 }
246
247 return VECTOR_SPLAT(mVWidth, src);
248 }
249
250 uint32_t Builder::IMMED(Value* v)
251 {
252 SWR_ASSERT(isa<ConstantInt>(v));
253 ConstantInt *pValConst = cast<ConstantInt>(v);
254 return pValConst->getZExtValue();
255 }
256
257 int32_t Builder::S_IMMED(Value* v)
258 {
259 SWR_ASSERT(isa<ConstantInt>(v));
260 ConstantInt *pValConst = cast<ConstantInt>(v);
261 return pValConst->getSExtValue();
262 }
263
264 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
265 {
266 std::vector<Value*> indices;
267 for (auto i : indexList)
268 indices.push_back(i);
269 return GEPA(ptr, indices);
270 }
271
272 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
273 {
274 std::vector<Value*> indices;
275 for (auto i : indexList)
276 indices.push_back(C(i));
277 return GEPA(ptr, indices);
278 }
279
280 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
281 {
282 std::vector<Value*> indices;
283 for (auto i : indexList)
284 indices.push_back(i);
285 return IN_BOUNDS_GEP(ptr, indices);
286 }
287
288 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
289 {
290 std::vector<Value*> indices;
291 for (auto i : indexList)
292 indices.push_back(C(i));
293 return IN_BOUNDS_GEP(ptr, indices);
294 }
295
296 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
297 {
298 std::vector<Value*> valIndices;
299 for (auto i : indices)
300 valIndices.push_back(C(i));
301 return LOAD(GEPA(basePtr, valIndices), name);
302 }
303
304 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
305 {
306 std::vector<Value*> valIndices;
307 for (auto i : indices)
308 valIndices.push_back(i);
309 return LOAD(GEPA(basePtr, valIndices), name);
310 }
311
312 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
313 {
314 std::vector<Value*> valIndices;
315 for (auto i : indices)
316 valIndices.push_back(C(i));
317 return STORE(val, GEPA(basePtr, valIndices));
318 }
319
320 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
321 {
322 std::vector<Value*> valIndices;
323 for (auto i : indices)
324 valIndices.push_back(i);
325 return STORE(val, GEPA(basePtr, valIndices));
326 }
327
328 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
329 {
330 std::vector<Value*> args;
331 for (auto arg : argsList)
332 args.push_back(arg);
333 return CALLA(Callee, args);
334 }
335
336 CallInst *Builder::CALL(Value *Callee, Value* arg)
337 {
338 std::vector<Value*> args;
339 args.push_back(arg);
340 return CALLA(Callee, args);
341 }
342
343 CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
344 {
345 std::vector<Value*> args;
346 args.push_back(arg1);
347 args.push_back(arg2);
348 return CALLA(Callee, args);
349 }
350
351 CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
352 {
353 std::vector<Value*> args;
354 args.push_back(arg1);
355 args.push_back(arg2);
356 args.push_back(arg3);
357 return CALLA(Callee, args);
358 }
359
360 //////////////////////////////////////////////////////////////////////////
361 Value *Builder::DEBUGTRAP()
362 {
363 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
364 return CALL(func);
365 }
366
367 Value *Builder::VRCP(Value *va)
368 {
369 return FDIV(VIMMED1(1.0f), va); // 1 / a
370 }
371
372 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
373 {
374 Value* vOut = FMADDPS(vA, vX, vC);
375 vOut = FMADDPS(vB, vY, vOut);
376 return vOut;
377 }
378
379 //////////////////////////////////////////////////////////////////////////
380 /// @brief Generate an i32 masked load operation in LLVM IR. If not
381 /// supported on the underlying platform, emulate it with float masked load
382 /// @param src - base address pointer for the load
383 /// @param vMask - SIMD wide mask that controls whether to access memory; masked-off lanes load 0
384 Value *Builder::MASKLOADD(Value* src,Value* mask)
385 {
386 Value* vResult;
387 // use avx2 maskload instruction if available
388 if(JM()->mArch.AVX2())
389 {
390 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
391 vResult = CALL(func,{src,mask});
392 }
393 else
394 {
395 // maskload intrinsic expects integer mask operand in llvm >= 3.8
396 #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
397 mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
398 #else
399 mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
400 #endif
401 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
402 vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
403 }
404 return vResult;
405 }
406
407 //////////////////////////////////////////////////////////////////////////
408 /// @brief insert a JIT call to CallPrint
409 /// - outputs formatted string to both stdout and VS output window
410 /// - DEBUG builds only
411 /// Usage example:
412 /// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
413 /// where C(lane) creates a constant value to print, and pIndex is the Value*
414 /// result from a GEP, printing out the pointer to memory
415 /// @param printStr - constant string to print, which includes format specifiers
416 /// @param printArgs - initializer list of Value*'s to print to std out
417 CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
418 {
419 // push the arguments to CallPrint into a vector
420 std::vector<Value*> printCallArgs;
421 // save room for the format string. we still need to modify it for vectors
422 printCallArgs.resize(1);
423
424 // search through the format string for special processing
425 size_t pos = 0;
426 std::string tempStr(printStr);
427 pos = tempStr.find('%', pos);
428 auto v = printArgs.begin();
429
430 while ((pos != std::string::npos) && (v != printArgs.end()))
431 {
432 Value* pArg = *v;
433 Type* pType = pArg->getType();
434
435 if (pType->isVectorTy())
436 {
437 Type* pContainedType = pType->getContainedType(0);
438
439 if (toupper(tempStr[pos + 1]) == 'X')
440 {
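// rewrite the single %x specifier as one "0x%08X " per vector lane and push each
// extracted lane as a separate argument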
441 tempStr[pos] = '0';
442 tempStr[pos + 1] = 'x';
443 tempStr.insert(pos + 2, "%08X ");
444 pos += 7;
445
446 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
447
448 std::string vectorFormatStr;
449 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
450 {
451 vectorFormatStr += "0x%08X ";
452 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
453 }
454
455 tempStr.insert(pos, vectorFormatStr);
456 pos += vectorFormatStr.size();
457 }
458 else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
459 {
460 uint32_t i = 0;
461 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
462 {
463 tempStr.insert(pos, std::string("%f "));
464 pos += 3;
465 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
466 }
467 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
468 }
469 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
470 {
471 uint32_t i = 0;
472 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
473 {
474 tempStr.insert(pos, std::string("%d "));
475 pos += 3;
476 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
477 }
478 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
479 }
480 }
481 else
482 {
483 if (toupper(tempStr[pos + 1]) == 'X')
484 {
485 tempStr[pos] = '0';
486 tempStr.insert(pos + 1, "x%08");
487 printCallArgs.push_back(pArg);
488 pos += 3;
489 }
490 // for %f we need to cast float Values to doubles so that they print out correctly
491 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
492 {
493 printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
494 pos++;
495 }
496 else
497 {
498 printCallArgs.push_back(pArg);
499 }
500 }
501
502 // advance to the next argument
503 v++;
504 pos = tempStr.find('%', ++pos);
505 }
506
507 // create global variable constant string
508 Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
509 GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
510 JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
511
512 // get a pointer to the first character in the constant string array
513 std::vector<Constant*> geplist{C(0),C(0)};
514 Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
515
516 // insert the pointer to the format string in the argument vector
517 printCallArgs[0] = strGEP;
518
519 // get pointer to CallPrint function and insert decl into the module if needed
520 std::vector<Type*> args;
521 args.push_back(PointerType::get(mInt8Ty,0));
522 FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
523 Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
524
525 // if we haven't yet added the symbol to the symbol table
526 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
527 {
528 sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
529 }
530
531 // insert a call to CallPrint
532 return CALLA(callPrintFn,printCallArgs);
533 }
534
535 //////////////////////////////////////////////////////////////////////////
536 /// @brief Wrapper around PRINT with initializer list.
537 CallInst* Builder::PRINT(const std::string &printStr)
538 {
539 return PRINT(printStr, {});
540 }
541
542 //////////////////////////////////////////////////////////////////////////
543 /// @brief Generate a masked gather operation in LLVM IR. If not
544 /// supported on the underlying platform, emulate it with loads
545 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
546 /// @param pBase - Int8* base VB address pointer value
547 /// @param vIndices - SIMD wide value of VB byte offsets
548 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
549 /// @param scale - value to scale indices by
550 Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
551 {
552 Value* vGather;
553
554 // use avx2 gather instruction if available
555 if(JM()->mArch.AVX2())
556 {
557 // force mask to <N x float>, required by vgather
558 vMask = BITCAST(vMask, mSimdFP32Ty);
559 vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
560 }
561 else
562 {
563 Value* pStack = STACKSAVE();
564
565 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
566 Value* vSrcPtr = ALLOCA(vSrc->getType());
567 STORE(vSrc, vSrcPtr);
568
569 vGather = VUNDEF_F();
570 Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
571 Value *vOffsets = MUL(vIndices,vScaleVec);
572 Value *mask = MASK(vMask);
573 for(uint32_t i = 0; i < mVWidth; ++i)
574 {
575 // single component byte index
576 Value *offset = VEXTRACT(vOffsets,C(i));
577 // byte pointer to component
578 Value *loadAddress = GEP(pBase,offset);
579 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
580 // pointer to the value to load if we're masking off a component
581 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
582 Value *selMask = VEXTRACT(mask,C(i));
583 // switch in a safe address to load from if this lane is masked off
584 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
585 Value *val = LOAD(validAddress);
586 vGather = VINSERT(vGather,val,C(i));
587 }
588 STACKRESTORE(pStack);
589 }
590
591 return vGather;
592 }
593
594 //////////////////////////////////////////////////////////////////////////
595 /// @brief Generate a masked gather operation in LLVM IR. If not
596 /// supported on the underlying platform, emulate it with loads
597 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
598 /// @param pBase - Int8* base VB address pointer value
599 /// @param vIndices - SIMD wide value of VB byte offsets
600 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
601 /// @param scale - value to scale indices by
602 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
603 {
604 Value* vGather;
605
606 // use avx2 gather instruction if available
607 if(JM()->mArch.AVX2())
608 {
609 vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
610 }
611 else
612 {
613 Value* pStack = STACKSAVE();
614
615 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
616 Value* vSrcPtr = ALLOCA(vSrc->getType());
617 STORE(vSrc, vSrcPtr);
618
619 vGather = VUNDEF_I();
620 Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
621 Value *vOffsets = MUL(vIndices, vScaleVec);
622 Value *mask = MASK(vMask);
623 for(uint32_t i = 0; i < mVWidth; ++i)
624 {
625 // single component byte index
626 Value *offset = VEXTRACT(vOffsets, C(i));
627 // byte pointer to component
628 Value *loadAddress = GEP(pBase, offset);
629 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
630 // pointer to the value to load if we're masking off a component
631 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
632 Value *selMask = VEXTRACT(mask, C(i));
633 // switch in a safe address to load from if this lane is masked off
634 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
635 Value *val = LOAD(validAddress, C(0));
636 vGather = VINSERT(vGather, val, C(i));
637 }
638
639 STACKRESTORE(pStack);
640 }
641 return vGather;
642 }
643
644 //////////////////////////////////////////////////////////////////////////
645 /// @brief Generate a masked gather operation in LLVM IR. If not
646 /// supported on the underlying platform, emulate it with loads
647 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
648 /// @param pBase - Int8* base VB address pointer value
649 /// @param vIndices - SIMD wide value of VB byte offsets
650 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
651 /// @param scale - value to scale indices by
652 Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
653 {
654 Value* vGather;
655
656 // use avx2 gather instruction if available
657 if(JM()->mArch.AVX2())
658 {
659 vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, scale);
660 }
661 else
662 {
663 Value* pStack = STACKSAVE();
664
665 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
666 Value* vSrcPtr = ALLOCA(vSrc->getType());
667 STORE(vSrc, vSrcPtr);
668
669 vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
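// doubles are twice as wide, so the emulated gather fills only mVWidth/2 lanes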
670 Value *vScaleVec = VECTOR_SPLAT(4, Z_EXT(scale,mInt32Ty));
671 Value *vOffsets = MUL(vIndices,vScaleVec);
672 Value *mask = MASK(vMask);
673 for(uint32_t i = 0; i < mVWidth/2; ++i)
674 {
675 // single component byte index
676 Value *offset = VEXTRACT(vOffsets,C(i));
677 // byte pointer to component
678 Value *loadAddress = GEP(pBase,offset);
679 loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
680 // pointer to the value to load if we're masking off a component
681 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
682 Value *selMask = VEXTRACT(mask,C(i));
683 // switch in a safe address to load from if this lane is masked off
684 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
685 Value *val = LOAD(validAddress);
686 vGather = VINSERT(vGather,val,C(i));
687 }
688 STACKRESTORE(pStack);
689 }
690 return vGather;
691 }
692
693 //////////////////////////////////////////////////////////////////////////
694 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
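/// (an x86 vector mask encodes 'true' as the sign bit of each lane, so a signed
/// compare of the lanes against zero recovers the <N x i1> form)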
695 Value* Builder::MASK(Value* vmask)
696 {
697 Value* src = BITCAST(vmask, mSimdInt32Ty);
698 return ICMP_SLT(src, VIMMED1(0));
699 }
700
701 //////////////////////////////////////////////////////////////////////////
702 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
703 Value* Builder::VMASK(Value* mask)
704 {
705 return S_EXT(mask, mSimdInt32Ty);
706 }
707
708 //////////////////////////////////////////////////////////////////////////
709 /// @brief Generate a VPSHUFB operation in LLVM IR. If not
710 /// supported on the underlying platform, emulate it
711 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
712 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
713 /// Byte masks in the lower 128-bit lane of b select 8-bit values from the lower
714 /// 128 bits of a, and vice versa for the upper lanes. If a mask
715 /// value is negative, '0' is inserted.
716 Value *Builder::PSHUFB(Value* a, Value* b)
717 {
718 Value* res;
719 // use avx2 pshufb instruction if available
720 if(JM()->mArch.AVX2())
721 {
722 res = VPSHUFB(a, b);
723 }
724 else
725 {
726 Constant* cB = dyn_cast<Constant>(b);
727 // number of 8 bit elements in b
728 uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
729 // output vector
730 Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
731
732 // insert an 8 bit value from the high and low lanes of a per loop iteration
733 numElms /= 2;
734 for(uint32_t i = 0; i < numElms; i++)
735 {
736 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
737 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
738
739 // extract values from constant mask
740 char valLow128bLane = (char)(cLow128b->getSExtValue());
741 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
742
743 Value* insertValLow128b;
744 Value* insertValHigh128b;
745
746 // if the mask value is negative, insert a '0' in the respective output position
747 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
748 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
749 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
750
751 vShuf = VINSERT(vShuf, insertValLow128b, i);
752 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
753 }
754 res = vShuf;
755 }
756 return res;
757 }
758
759 //////////////////////////////////////////////////////////////////////////
760 /// @brief Generate a VPMOVSXBD operation (sign extend 8 8-bit values to 32
761 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
762 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
763 /// lower 8 values are used.
764 Value *Builder::PMOVSXBD(Value* a)
765 {
766 // llvm-3.9 removed the pmovsxbd intrinsic
767 #if HAVE_LLVM < 0x309
768 // use avx2 byte sign extend instruction if available
769 if(JM()->mArch.AVX2())
770 {
771 Function *pmovsxbd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd);
772 return CALL(pmovsxbd, std::initializer_list<Value*>{a});
773 }
774 else
775 #endif
776 {
777 // VPMOVSXBD output type
778 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
779 // Extract 8 values from 128bit lane and sign extend
780 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
781 }
782 }
783
784 //////////////////////////////////////////////////////////////////////////
785 /// @brief Generate a VPMOVSXWD operation (sign extend 8 16-bit values to 32
786 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
787 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
788 Value *Builder::PMOVSXWD(Value* a)
789 {
790 // llvm-3.9 removed the pmovsxwd intrinsic
791 #if HAVE_LLVM < 0x309
792 // use avx2 word sign extend if available
793 if(JM()->mArch.AVX2())
794 {
795 Function *pmovsxwd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd);
796 return CALL(pmovsxwd, std::initializer_list<Value*>{a});
797 }
798 else
799 #endif
800 {
801 // VPMOVSXWD output type
802 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
803 // Extract 8 values from 128bit lane and sign extend
804 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
805 }
806 }
807
808 //////////////////////////////////////////////////////////////////////////
809 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
810 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
811 /// platform, emulate it
812 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
813 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
814 Value *Builder::PERMD(Value* a, Value* idx)
815 {
816 Value* res;
817 // use avx2 permute instruction if available
818 if(JM()->mArch.AVX2())
819 {
820 res = VPERMD(a, idx);
821 }
822 else
823 {
824 if (isa<Constant>(idx))
825 {
826 res = VSHUFFLE(a, a, idx);
827 }
828 else
829 {
830 res = VUNDEF_I();
831 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
832 {
833 Value* pIndex = VEXTRACT(idx, C(l));
834 Value* pVal = VEXTRACT(a, pIndex);
835 res = VINSERT(res, pVal, C(l));
836 }
837 }
838 }
839 return res;
840 }
841
842 //////////////////////////////////////////////////////////////////////////
843 /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
844 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
845 /// platform, emulate it
846 /// @param a - 256bit SIMD lane(8x32bit) of float values.
847 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
848 Value *Builder::PERMPS(Value* a, Value* idx)
849 {
850 Value* res;
851 // use avx2 permute instruction if available
852 if (JM()->mArch.AVX2())
853 {
854 // llvm 3.6.0 swapped the order of the args to vpermps
855 res = VPERMPS(idx, a);
856 }
857 else
858 {
859 if (isa<Constant>(idx))
860 {
861 res = VSHUFFLE(a, a, idx);
862 }
863 else
864 {
865 res = VUNDEF_F();
866 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
867 {
868 Value* pIndex = VEXTRACT(idx, C(l));
869 Value* pVal = VEXTRACT(a, pIndex);
870 res = VINSERT(res, pVal, C(l));
871 }
872 }
873 }
874
875 return res;
876 }
877
878 //////////////////////////////////////////////////////////////////////////
879 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
880 /// in LLVM IR. If not supported on the underlying platform, emulate it
881 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
882 Value *Builder::CVTPH2PS(Value* a)
883 {
884 if (JM()->mArch.F16C())
885 {
886 return VCVTPH2PS(a);
887 }
888 else
889 {
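// no F16C support; call the scalar conversion helper once per lane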
890 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
891 Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));
892
893 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
894 {
895 sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
896 }
897
898 Value* pResult = UndefValue::get(mSimdFP32Ty);
899 for (uint32_t i = 0; i < mVWidth; ++i)
900 {
901 Value* pSrc = VEXTRACT(a, C(i));
902 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
903 pResult = VINSERT(pResult, pConv, C(i));
904 }
905
906 return pResult;
907 }
908 }
909
910 //////////////////////////////////////////////////////////////////////////
911 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
912 /// in LLVM IR. If not supported on the underlying platform, emulate it
913 /// @param a - 256bit SIMD lane(8x32bit) of float32 values.
914 Value *Builder::CVTPS2PH(Value* a, Value* rounding)
915 {
916 if (JM()->mArch.F16C())
917 {
918 return VCVTPS2PH(a, rounding);
919 }
920 else
921 {
922 // call scalar C function for now
923 FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
924 Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));
925
926 if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
927 {
928 sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
929 }
930
931 Value* pResult = UndefValue::get(mSimdInt16Ty);
932 for (uint32_t i = 0; i < mVWidth; ++i)
933 {
934 Value* pSrc = VEXTRACT(a, C(i));
935 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
936 pResult = VINSERT(pResult, pConv, C(i));
937 }
938
939 return pResult;
940 }
941 }
942
943 Value *Builder::PMAXSD(Value* a, Value* b)
944 {
945 // llvm-3.9 removed the pmax intrinsics
946 #if HAVE_LLVM >= 0x309
947 Value* cmp = ICMP_SGT(a, b);
948 return SELECT(cmp, a, b);
949 #else
950 if (JM()->mArch.AVX2())
951 {
952 Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmaxs_d);
953 return CALL(pmaxsd, {a, b});
954 }
955 else
956 {
957 // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
958 Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);
959
960 // low 128
961 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
962 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
963 Value* resLo = CALL(pmaxsd, {aLo, bLo});
964
965 // high 128
966 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
967 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
968 Value* resHi = CALL(pmaxsd, {aHi, bHi});
969
970 // combine
971 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
972 result = VINSERTI128(result, resHi, C((uint8_t)1));
973
974 return result;
975 }
976 #endif
977 }
978
979 Value *Builder::PMINSD(Value* a, Value* b)
980 {
981 // llvm-3.9 removed the pmin intrinsics
982 #if HAVE_LLVM >= 0x309
983 Value* cmp = ICMP_SLT(a, b);
984 return SELECT(cmp, a, b);
985 #else
986 if (JM()->mArch.AVX2())
987 {
988 Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmins_d);
989 return CALL(pminsd, {a, b});
990 }
991 else
992 {
993 // use 4-wide sse min intrinsic on lower/upper halves of 8-wide sources
994 Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);
995
996 // low 128
997 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
998 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
999 Value* resLo = CALL(pminsd, {aLo, bLo});
1000
1001 // high 128
1002 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
1003 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
1004 Value* resHi = CALL(pminsd, {aHi, bHi});
1005
1006 // combine
1007 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
1008 result = VINSERTI128(result, resHi, C((uint8_t)1));
1009
1010 return result;
1011 }
1012 #endif
1013 }
1014
1015 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
1016 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1017 {
1018 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
1019 if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
1020 {
1021 // ensure our mask is the correct type
1022 mask = BITCAST(mask, mSimdFP32Ty);
1023 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1024 }
1025 else
1026 {
1027 // ensure our mask is the correct type
1028 mask = BITCAST(mask, mSimdInt32Ty);
1029 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1030 }
1031 }
1032
1033 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1034 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1035 {
1036 switch(info.bpp / info.numComps)
1037 {
1038 case 16:
1039 {
1040 Value* vGatherResult[2];
1041 Value *vMask;
1042
1043 // TODO: vGatherMaskedVal
1044 Value* vGatherMaskedVal = VIMMED1((float)0);
1045
1046 // always have at least one component out of x or y to fetch
1047
1048 // save mask as it is zero'd out after each gather
1049 vMask = mask;
1050
1051 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1052 // e.g. result of first 8x32bit integer gather for 16bit components
1053 // 256i - 0 1 2 3 4 5 6 7
1054 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1055 //
1056
1057 // if we have at least one component out of z or w to fetch
1058 if(info.numComps > 2)
1059 {
1060 // offset base to the next components(zw) in the vertex to gather
1061 pSrcBase = GEP(pSrcBase, C((char)4));
1062 vMask = mask;
1063
1064 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1065 // e.g. result of second 8x32bit integer gather for 16bit components
1066 // 256i - 0 1 2 3 4 5 6 7
1067 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1068 //
1069 }
1070 else
1071 {
1072 vGatherResult[1] = vGatherMaskedVal;
1073 }
1074
1075 // Shuffle gathered components into place, each row is a component
1076 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1077 }
1078 break;
1079 case 32:
1080 {
1081 // apply defaults
1082 for (uint32_t i = 0; i < 4; ++i)
1083 {
1084 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1085 }
1086
1087 for(uint32_t i = 0; i < info.numComps; i++)
1088 {
1089 uint32_t swizzleIndex = info.swizzle[i];
1090
1091 // save mask as it is zero'd out after each gather
1092 Value *vMask = mask;
1093
1094 // Gather a SIMD of components
1095 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1096
1097 // offset base to the next component to gather
1098 pSrcBase = GEP(pSrcBase, C((char)4));
1099 }
1100 }
1101 break;
1102 default:
1103 SWR_INVALID("Invalid float format");
1104 break;
1105 }
1106 }
1107
1108 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1109 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1110 {
1111 switch (info.bpp / info.numComps)
1112 {
1113 case 8:
1114 {
1115 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1116 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
1117 // e.g. result of an 8x32bit integer gather for 8bit components
1118 // 256i - 0 1 2 3 4 5 6 7
1119 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1120
1121 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1122 }
1123 break;
1124 case 16:
1125 {
1126 Value* vGatherResult[2];
1127 Value *vMask;
1128
1129 // TODO: vGatherMaskedVal
1130 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1131
1132 // always have at least one component out of x or y to fetch
1133
1134 // save mask as it is zero'd out after each gather
1135 vMask = mask;
1136
1137 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1138 // e.g. result of first 8x32bit integer gather for 16bit components
1139 // 256i - 0 1 2 3 4 5 6 7
1140 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1141 //
1142
1143 // if we have at least one component out of z or w to fetch
1144 if(info.numComps > 2)
1145 {
1146 // offset base to the next components(zw) in the vertex to gather
1147 pSrcBase = GEP(pSrcBase, C((char)4));
1148 vMask = mask;
1149
1150 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1151 // e.g. result of second 8x32bit integer gather for 16bit components
1152 // 256i - 0 1 2 3 4 5 6 7
1153 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1154 //
1155 }
1156 else
1157 {
1158 vGatherResult[1] = vGatherMaskedVal;
1159 }
1160
1161 // Shuffle gathered components into place, each row is a component
1162 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1163
1164 }
1165 break;
1166 case 32:
1167 {
1168 // apply defaults
1169 for (uint32_t i = 0; i < 4; ++i)
1170 {
1171 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1172 }
1173
1174 for(uint32_t i = 0; i < info.numComps; i++)
1175 {
1176 uint32_t swizzleIndex = info.swizzle[i];
1177
1178 // save mask as it is zero'd out after each gather
1179 Value *vMask = mask;
1180
1181 // Gather a SIMD of components
1182 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1183
1184 // offset base to the next component to gather
1185 pSrcBase = GEP(pSrcBase, C((char)4));
1186 }
1187 }
1188 break;
1189 default:
1190 SWR_INVALID("unsupported format");
1191 break;
1192 }
1193 }
1194
1195 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1196 {
1197 // cast types
1198 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1199 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1200
1201 // input could either be float or int vector; do shuffle work in int
1202 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1203 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1204
1205 if(bPackedOutput)
1206 {
1207 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1208
1209 // shuffle mask
1210 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1211 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1212 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1213 // after pshufb: group components together in each 128bit lane
1214 // 256i - 0 1 2 3 4 5 6 7
1215 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1216
1217 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1218 // after PERMD: move and pack xy components into each 128bit lane
1219 // 256i - 0 1 2 3 4 5 6 7
1220 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1221
1222 // do the same for zw components
1223 Value* vi128ZW = nullptr;
1224 if(info.numComps > 2)
1225 {
1226 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1227 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1228 }
1229
1230 for(uint32_t i = 0; i < 4; i++)
1231 {
1232 uint32_t swizzleIndex = info.swizzle[i];
1233 // todo: fix for packed
1234 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1235 if(i >= info.numComps)
1236 {
1237 // set the default component val
1238 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1239 continue;
1240 }
1241
1242 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1243 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1244 // if x or y, use vi128XY permute result, else use vi128ZW
1245 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1246
1247 // extract packed component 128 bit lanes
1248 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1249 }
1250
1251 }
1252 else
1253 {
1254 // pshufb masks for each component
1255 Value* vConstMask[2];
1256 // x/z shuffle mask
1257 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1258 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1259
1260 // y/w shuffle mask
1261 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1262 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1263
1264
1265 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1266 // apply defaults
1267 for (uint32_t i = 0; i < 4; ++i)
1268 {
1269 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1270 }
1271
1272 for(uint32_t i = 0; i < info.numComps; i++)
1273 {
1274 uint32_t swizzleIndex = info.swizzle[i];
1275
1276 // select correct constMask for x/z or y/w pshufb
1277 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1278 // if x or y, use the first (xy) gather result, else use the second (zw) gather result
1279 uint32_t selectedGather = (i < 2) ? 0 : 1;
1280
1281 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1282 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1283 // 256i - 0 1 2 3 4 5 6 7
1284 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1285 }
1286 }
1287 }
1288
1289 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1290 {
1291 // cast types
1292 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1293 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1294
1295 if(bPackedOutput)
1296 {
1297 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1298 // shuffle mask
1299 Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1300 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1301 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1302 // after pshufb: group components together in each 128bit lane
1303 // 256i - 0 1 2 3 4 5 6 7
1304 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1305
1306 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1307 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1308 // 256i - 0 1 2 3 4 5 6 7
1309 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1310
1311 // do the same for zw components
1312 Value* vi128ZW = nullptr;
1313 if(info.numComps > 2)
1314 {
1315 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1316 }
1317
1318 // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
1319 for(uint32_t i = 0; i < 4; i++)
1320 {
1321 uint32_t swizzleIndex = info.swizzle[i];
1322 // todo: fix for packed
1323 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1324 if(i >= info.numComps)
1325 {
1326 // set the default component val
1327 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1328 continue;
1329 }
1330
1331 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1332 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1333 // if x or y, use vi128XY permute result, else use vi128ZW
1334 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1335
1336 // sign extend
1337 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1338 }
1339 }
1340 // else zero extend
1341 else{
1342 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1343 // apply defaults
1344 for (uint32_t i = 0; i < 4; ++i)
1345 {
1346 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1347 }
1348
1349 for(uint32_t i = 0; i < info.numComps; i++){
1350 uint32_t swizzleIndex = info.swizzle[i];
1351
1352 // pshufb masks for each component
1353 Value* vConstMask;
1354 switch(i)
1355 {
1356 case 0:
1357 // x shuffle mask
1358 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1359 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1360 break;
1361 case 1:
1362 // y shuffle mask
1363 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1364 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1365 break;
1366 case 2:
1367 // z shuffle mask
1368 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1369 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1370 break;
1371 case 3:
1372 // w shuffle mask
1373 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1374 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1375 break;
1376 default:
1377 vConstMask = nullptr;
1378 break;
1379 }
1380
1381 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1382 // after pshufb for x channel
1383 // 256i - 0 1 2 3 4 5 6 7
1384 // x000 x000 x000 x000 x000 x000 x000 x000
1385 }
1386 }
1387 }
1388
1389 // Helper function to create alloca in entry block of function
1390 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
1391 {
1392 auto saveIP = IRB()->saveIP();
1393 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1394 pFunc->getEntryBlock().begin());
1395 Value* pAlloca = ALLOCA(pType);
1396 if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1397 return pAlloca;
1398 }
1399
1400 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
1401 {
1402 auto saveIP = IRB()->saveIP();
1403 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1404 pFunc->getEntryBlock().begin());
1405 Value* pAlloca = ALLOCA(pType, pArraySize);
1406 if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1407 return pAlloca;
1408 }
1409
1410 //////////////////////////////////////////////////////////////////////////
1411 /// @brief emulates a scatter operation.
1412 /// @param pDst - pointer to destination
1413 /// @param vSrc - vector of src data to scatter
1414 /// @param vOffsets - vector of byte offsets from pDst
1415 /// @param vMask - mask of valid lanes
1416 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1417 {
1418 /* Scatter algorithm
1419
1420 while(Index = BitScanForward(mask))
1421 srcElem = srcVector[Index]
1422 offsetElem = offsetVector[Index]
1423 *(pDst + offsetElem) = srcElem
1424 Update mask (mask &= ~(1 << Index))
1425
1426 */
1427
1428 BasicBlock* pCurBB = IRB()->GetInsertBlock();
1429 Function* pFunc = pCurBB->getParent();
1430 Type* pSrcTy = vSrc->getType()->getVectorElementType();
1431
1432 // Store vectors on stack
1433 if (pScatterStackSrc == nullptr)
1434 {
1435 // Save off stack allocations and reuse per scatter. Significantly reduces stack
1436 // requirements for shaders with a lot of scatters.
1437 pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
1438 pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
1439 }
1440
1441 Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
1442 Value* pOffsetsArrayPtr = pScatterStackOffsets;
1443 STORE(vSrc, pSrcArrayPtr);
1444 STORE(vOffsets, pOffsetsArrayPtr);
1445
1446 // Cast to pointers for random access
1447 pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
1448 pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
1449
1450 Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
1451
1452 // Get cttz function
1453 Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
1454
1455 // Setup loop basic block
1456 BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc);
1457
1458 // compute first set bit
1459 Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
1460
1461 Value* pIsUndef = ICMP_EQ(pIndex, C(32));
1462
1463 // Split current block
1464 BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
1465
1466 // Remove unconditional jump created by splitBasicBlock
1467 pCurBB->getTerminator()->eraseFromParent();
1468
1469 // Add terminator to end of original block
1470 IRB()->SetInsertPoint(pCurBB);
1471
1472 // Add conditional branch
1473 COND_BR(pIsUndef, pPostLoop, pLoop);
1474
1475 // Add loop basic block contents
1476 IRB()->SetInsertPoint(pLoop);
1477 PHINode* pIndexPhi = PHI(mInt32Ty, 2);
1478 PHINode* pMaskPhi = PHI(mInt32Ty, 2);
1479
1480 pIndexPhi->addIncoming(pIndex, pCurBB);
1481 pMaskPhi->addIncoming(pMask, pCurBB);
1482
1483 // Extract elements for this index
1484 Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
1485 Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
1486
1487 // GEP to this offset in dst
1488 Value* pCurDst = GEP(pDst, pOffsetElem);
1489 pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
1490 STORE(pSrcElem, pCurDst);
1491
1492 // Update the mask
1493 Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
1494
1495 // Terminator
1496 Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
1497
1498 pIsUndef = ICMP_EQ(pNewIndex, C(32));
1499 COND_BR(pIsUndef, pPostLoop, pLoop);
1500
1501 // Update phi edges
1502 pIndexPhi->addIncoming(pNewIndex, pLoop);
1503 pMaskPhi->addIncoming(pNewMask, pLoop);
1504
1505 // Move builder to beginning of post loop
1506 IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
1507 }
1508
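//////////////////////////////////////////////////////////////////////////
/// @brief per-lane float absolute value, implemented by clearing the sign bit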
1509 Value* Builder::VABSPS(Value* a)
1510 {
1511 Value* asInt = BITCAST(a, mSimdInt32Ty);
1512 Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1513 return result;
1514 }
1515
1516 Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
1517 {
1518 Value *lowCmp = ICMP_SLT(src, low);
1519 Value *ret = SELECT(lowCmp, low, src);
1520
1521 Value *highCmp = ICMP_SGT(ret, high);
1522 ret = SELECT(highCmp, high, ret);
1523
1524 return ret;
1525 }
1526
1527 Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1528 {
1529 Value *lowCmp = FCMP_OLT(src, low);
1530 Value *ret = SELECT(lowCmp, low, src);
1531
1532 Value *highCmp = FCMP_OGT(ret, high);
1533 ret = SELECT(highCmp, high, ret);
1534
1535 return ret;
1536 }
1537
1538 Value *Builder::FCLAMP(Value* src, float low, float high)
1539 {
1540 Value* result = VMAXPS(src, VIMMED1(low));
1541 result = VMINPS(result, VIMMED1(high));
1542
1543 return result;
1544 }
1545
1546 //////////////////////////////////////////////////////////////////////////
1547 /// @brief save/restore stack, providing ability to push/pop the stack and
1548 /// reduce overall stack requirements for temporary stack use
1549 Value* Builder::STACKSAVE()
1550 {
1551 Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1552 return CALLA(pfnStackSave);
1553 }
1554
1555 void Builder::STACKRESTORE(Value* pSaved)
1556 {
1557 Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1558 CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1559 }
1560
1561 Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1562 {
1563 Value* vOut;
1564 // use FMADs if available
1565 if(JM()->mArch.AVX2())
1566 {
1567 vOut = VFMADDPS(a, b, c);
1568 }
1569 else
1570 {
1571 vOut = FADD(FMUL(a, b), c);
1572 }
1573 return vOut;
1574 }
1575
1576 Value* Builder::POPCNT(Value* a)
1577 {
1578 Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1579 return CALL(pCtPop, std::initializer_list<Value*>{a});
1580 }
1581
1582 //////////////////////////////////////////////////////////////////////////
1583 /// @brief C functions called by LLVM IR
1584 //////////////////////////////////////////////////////////////////////////
1585
1586 //////////////////////////////////////////////////////////////////////////
1587 /// @brief called in JIT code, inserted by PRINT
1588 /// output to both stdout and visual studio debug console
1589 void __cdecl CallPrint(const char* fmt, ...)
1590 {
591 va_list args;
592 va_start(args, fmt);
593 vprintf(fmt, args);
594
595 #if defined( _WIN32 )
596 char strBuf[1024];
    // vprintf has consumed 'args'; restart the va_list before reusing it
    va_end(args);
    va_start(args, fmt);
597 vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
598 OutputDebugString(strBuf);
599 #endif
600
601 va_end(args);
1602 }
1603
1604 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1605 {
1606 bool flag = !imm8->isZeroValue();
1607 SmallVector<Constant*,8> idx;
1608 for (unsigned i = 0; i < mVWidth / 2; i++) {
1609 idx.push_back(C(flag ? i + mVWidth / 2 : i));
1610 }
1611 return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1612 }
1613
1614 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1615 {
1616 bool flag = !imm8->isZeroValue();
1617 SmallVector<Constant*,8> idx;
1618 for (unsigned i = 0; i < mVWidth; i++) {
1619 idx.push_back(C(i));
1620 }
1621 Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1622
1623 SmallVector<Constant*,8> idx2;
1624 for (unsigned i = 0; i < mVWidth / 2; i++) {
1625 idx2.push_back(C(flag ? i : i + mVWidth));
1626 }
1627 for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1628 idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1629 }
1630 return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1631 }
1632
1633 // rdtsc buckets macros
1634 void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1635 {
1636 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1637 // buckets framework when single threaded
1638 if (KNOB_SINGLE_THREADED)
1639 {
1640 std::vector<Type*> args{
1641 PointerType::get(mInt32Ty, 0), // pBucketMgr
1642 mInt32Ty // id
1643 };
1644
1645 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1646 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1647 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1648 {
1649 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1650 }
1651
1652 CALL(pFunc, { pBucketMgr, pId });
1653 }
1654 }
1655
1656 void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1657 {
1658 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1659 // buckets framework when single threaded
1660 if (KNOB_SINGLE_THREADED)
1661 {
1662 std::vector<Type*> args{
1663 PointerType::get(mInt32Ty, 0), // pBucketMgr
1664 mInt32Ty // id
1665 };
1666
1667 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1668 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1669 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1670 {
1671 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1672 }
1673
1674 CALL(pFunc, { pBucketMgr, pId });
1675 }
1676 }
1677
1678
1679 uint32_t Builder::GetTypeSize(Type* pType)
1680 {
1681 if (pType->isStructTy())
1682 {
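// note: assumes a homogeneous struct with no padding; the size of the first
// element type is applied to every member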
1683 uint32_t numElems = pType->getStructNumElements();
1684 Type* pElemTy = pType->getStructElementType(0);
1685 return numElems * GetTypeSize(pElemTy);
1686 }
1687
1688 if (pType->isArrayTy())
1689 {
1690 uint32_t numElems = pType->getArrayNumElements();
1691 Type* pElemTy = pType->getArrayElementType();
1692 return numElems * GetTypeSize(pElemTy);
1693 }
1694
1695 if (pType->isIntegerTy())
1696 {
1697 uint32_t bitSize = pType->getIntegerBitWidth();
1698 return bitSize / 8;
1699 }
1700
1701 if (pType->isFloatTy())
1702 {
1703 return 4;
1704 }
1705
1706 if (pType->isHalfTy())
1707 {
1708 return 2;
1709 }
1710
1711 if (pType->isDoubleTy())
1712 {
1713 return 8;
1714 }
1715
1716 SWR_ASSERT(false, "Unimplemented type.");
1717 return 0;
1718 }
1719 }