1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "common/rdtsc_buckets.h"
32
33 #include <cstdarg>
34
35 namespace SwrJit
36 {
37 void __cdecl CallPrint(const char* fmt, ...);
38
39 //////////////////////////////////////////////////////////////////////////
40 /// @brief Convert an IEEE 754 32-bit single precision float to a
41 /// 16-bit half precision float with 5 exponent bits and
42 /// 10 mantissa bits.
43 /// @param val - 32-bit float
44 /// @todo Maybe move this outside of this file into a header?
45 static uint16_t ConvertFloat32ToFloat16(float val)
46 {
47 uint32_t sign, exp, mant;
48 uint32_t roundBits;
49
50 // Extract the sign, exponent, and mantissa
51 uint32_t uf = *(uint32_t*)&val;
52 sign = (uf & 0x80000000) >> 31;
53 exp = (uf & 0x7F800000) >> 23;
54 mant = uf & 0x007FFFFF;
55
56 // Check for out of range
57 if (std::isnan(val))
58 {
59 exp = 0x1F;
60 mant = 0x200;
61 sign = 1; // set the sign bit for NANs
62 }
63 else if (std::isinf(val))
64 {
65 exp = 0x1f;
66 mant = 0x0;
67 }
68 else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
69 {
70 exp = 0x1E;
71 mant = 0x3FF;
72 }
73 else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
74 {
75 mant |= 0x00800000;
76 for (; exp <= 0x70; mant >>= 1, exp++)
77 ;
78 exp = 0;
79 mant = mant >> 13;
80 }
81 else if (exp < 0x66) // Too small to represent -> Zero
82 {
83 exp = 0;
84 mant = 0;
85 }
86 else
87 {
88 // Saves bits that will be shifted off for rounding
89 roundBits = mant & 0x1FFFu;
90 // convert exponent and mantissa to 16 bit format
91 exp = exp - 0x70;
92 mant = mant >> 13;
93
94 // Essentially RTZ, but round up if off by only 1 lsb
95 if (roundBits == 0x1FFFu)
96 {
97 mant++;
98 // check for overflow
99 if ((mant & 0xC00u) != 0)
100 exp++;
101 // make sure only the needed bits are used
102 mant &= 0x3FF;
103 }
104 }
105
106 uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
107 return (uint16_t)tmpVal;
108 }
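// A quick worked example of the normal path above (a reference sketch, not
// additional logic): 1.0f has bits 0x3F800000, so sign = 0, exp = 0x7F and
// mant = 0. Rebiasing the exponent gives 0x7F - 0x70 = 0x0F, dropping the low
// 13 mantissa bits leaves 0, and the packed result is
// (0 << 15) | (0x0F << 10) | 0 = 0x3C00, the half precision encoding of 1.0.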
109
110 //////////////////////////////////////////////////////////////////////////
111 /// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
112 /// float
113 /// @param val - 16-bit float
114 /// @todo Maybe move this outside of this file into a header?
115 static float ConvertFloat16ToFloat32(uint32_t val)
116 {
117 uint32_t result;
118 if ((val & 0x7fff) == 0)
119 {
120 result = ((uint32_t)(val & 0x8000)) << 16;
121 }
122 else if ((val & 0x7c00) == 0x7c00)
123 {
124 result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
125 result |= ((uint32_t)val & 0x8000) << 16;
126 }
127 else
128 {
129 uint32_t sign = (val & 0x8000) << 16;
130 uint32_t mant = (val & 0x3ff) << 13;
131 uint32_t exp = (val >> 10) & 0x1f;
132 if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
133 {
134 mant <<= 1;
135 while (mant < (0x400 << 13))
136 {
137 exp--;
138 mant <<= 1;
139 }
140 mant &= (0x3ff << 13);
141 }
142 exp = ((exp - 15 + 127) & 0xff) << 23;
143 result = sign | exp | mant;
144 }
145
146 return *(float*)&result;
147 }
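// Worked example for the inverse direction (reference only): half 0x3C00 has
// sign = 0, exp = 15 and mant = 0, so the rebias (15 - 15 + 127) << 23 yields
// 0x3F800000, i.e. 1.0f. Subnormal halves take the extra loop above, which
// shifts the mantissa up and decrements exp until the value is normalized.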
148
149 Constant *Builder::C(bool i)
150 {
151 return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
152 }
153
154 Constant *Builder::C(char i)
155 {
156 return ConstantInt::get(IRB()->getInt8Ty(), i);
157 }
158
159 Constant *Builder::C(uint8_t i)
160 {
161 return ConstantInt::get(IRB()->getInt8Ty(), i);
162 }
163
164 Constant *Builder::C(int i)
165 {
166 return ConstantInt::get(IRB()->getInt32Ty(), i);
167 }
168
169 Constant *Builder::C(int64_t i)
170 {
171 return ConstantInt::get(IRB()->getInt64Ty(), i);
172 }
173
174 Constant *Builder::C(uint16_t i)
175 {
176 return ConstantInt::get(mInt16Ty,i);
177 }
178
179 Constant *Builder::C(uint32_t i)
180 {
181 return ConstantInt::get(IRB()->getInt32Ty(), i);
182 }
183
184 Constant *Builder::C(float i)
185 {
186 return ConstantFP::get(IRB()->getFloatTy(), i);
187 }
188
189 Constant *Builder::PRED(bool pred)
190 {
191 return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
192 }
193
194 Value *Builder::VIMMED1(int i)
195 {
196 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
197 }
198
199 Value *Builder::VIMMED1_16(int i)
200 {
201 return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
202 }
203
204 Value *Builder::VIMMED1(uint32_t i)
205 {
206 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
207 }
208
209 Value *Builder::VIMMED1_16(uint32_t i)
210 {
211 return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
212 }
213
214 Value *Builder::VIMMED1(float i)
215 {
216 return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
217 }
218
219 Value *Builder::VIMMED1_16(float i)
220 {
221 return ConstantVector::getSplat(mVWidth16, cast<ConstantFP>(C(i)));
222 }
223
224 Value *Builder::VIMMED1(bool i)
225 {
226 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
227 }
228
229 Value *Builder::VIMMED1_16(bool i)
230 {
231 return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
232 }
233
234 Value *Builder::VUNDEF_IPTR()
235 {
236 return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
237 }
238
239 Value *Builder::VUNDEF(Type* t)
240 {
241 return UndefValue::get(VectorType::get(t, mVWidth));
242 }
243
244 Value *Builder::VUNDEF_I()
245 {
246 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
247 }
248
249 Value *Builder::VUNDEF_I_16()
250 {
251 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth16));
252 }
253
254 Value *Builder::VUNDEF_F()
255 {
256 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
257 }
258
259 Value *Builder::VUNDEF_F_16()
260 {
261 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth16));
262 }
263
264 Value *Builder::VUNDEF(Type *ty, uint32_t size)
265 {
266 return UndefValue::get(VectorType::get(ty, size));
267 }
268
269 Value *Builder::VBROADCAST(Value *src)
270 {
271 // check if src is already a vector
272 if (src->getType()->isVectorTy())
273 {
274 return src;
275 }
276
277 return VECTOR_SPLAT(mVWidth, src);
278 }
279
280 Value *Builder::VBROADCAST_16(Value *src)
281 {
282 // check if src is already a vector
283 if (src->getType()->isVectorTy())
284 {
285 return src;
286 }
287
288 return VECTOR_SPLAT(mVWidth16, src);
289 }
290
291 uint32_t Builder::IMMED(Value* v)
292 {
293 SWR_ASSERT(isa<ConstantInt>(v));
294 ConstantInt *pValConst = cast<ConstantInt>(v);
295 return pValConst->getZExtValue();
296 }
297
298 int32_t Builder::S_IMMED(Value* v)
299 {
300 SWR_ASSERT(isa<ConstantInt>(v));
301 ConstantInt *pValConst = cast<ConstantInt>(v);
302 return pValConst->getSExtValue();
303 }
304
305 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
306 {
307 std::vector<Value*> indices;
308 for (auto i : indexList)
309 indices.push_back(i);
310 return GEPA(ptr, indices);
311 }
312
313 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
314 {
315 std::vector<Value*> indices;
316 for (auto i : indexList)
317 indices.push_back(C(i));
318 return GEPA(ptr, indices);
319 }
320
321 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
322 {
323 std::vector<Value*> indices;
324 for (auto i : indexList)
325 indices.push_back(i);
326 return IN_BOUNDS_GEP(ptr, indices);
327 }
328
329 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
330 {
331 std::vector<Value*> indices;
332 for (auto i : indexList)
333 indices.push_back(C(i));
334 return IN_BOUNDS_GEP(ptr, indices);
335 }
336
337 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
338 {
339 std::vector<Value*> valIndices;
340 for (auto i : indices)
341 valIndices.push_back(C(i));
342 return LOAD(GEPA(basePtr, valIndices), name);
343 }
344
345 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
346 {
347 std::vector<Value*> valIndices;
348 for (auto i : indices)
349 valIndices.push_back(i);
350 return LOAD(GEPA(basePtr, valIndices), name);
351 }
352
353 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
354 {
355 std::vector<Value*> valIndices;
356 for (auto i : indices)
357 valIndices.push_back(C(i));
358 return STORE(val, GEPA(basePtr, valIndices));
359 }
360
361 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
362 {
363 std::vector<Value*> valIndices;
364 for (auto i : indices)
365 valIndices.push_back(i);
366 return STORE(val, GEPA(basePtr, valIndices));
367 }
368
369 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
370 {
371 std::vector<Value*> args;
372 for (auto arg : argsList)
373 args.push_back(arg);
374 return CALLA(Callee, args);
375 }
376
377 CallInst *Builder::CALL(Value *Callee, Value* arg)
378 {
379 std::vector<Value*> args;
380 args.push_back(arg);
381 return CALLA(Callee, args);
382 }
383
384 CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
385 {
386 std::vector<Value*> args;
387 args.push_back(arg1);
388 args.push_back(arg2);
389 return CALLA(Callee, args);
390 }
391
392 CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
393 {
394 std::vector<Value*> args;
395 args.push_back(arg1);
396 args.push_back(arg2);
397 args.push_back(arg3);
398 return CALLA(Callee, args);
399 }
400
401 //////////////////////////////////////////////////////////////////////////
402 Value *Builder::DEBUGTRAP()
403 {
404 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
405 return CALL(func);
406 }
407
408 Value *Builder::VRCP(Value *va)
409 {
410 return FDIV(VIMMED1(1.0f), va); // 1 / a
411 }
412
413 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
414 {
415 Value* vOut = FMADDPS(vA, vX, vC);
416 vOut = FMADDPS(vB, vY, vOut);
417 return vOut;
418 }
419
420 //////////////////////////////////////////////////////////////////////////
421 /// @brief Generate an i32 masked load operation in LLVM IR. If not
422 /// supported on the underlying platform, emulate it with float masked load
423 /// @param src - base address pointer for the load
424 /// @param vMask - SIMD wide mask that controls whether to access memory or load 0 for the lane
425 Value *Builder::MASKLOADD(Value* src,Value* mask)
426 {
427 Value* vResult;
428 // use avx2 maskload instruction if available
429 if(JM()->mArch.AVX2())
430 {
431 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
432 vResult = CALL(func,{src,mask});
433 }
434 else
435 {
436 // maskload intrinsic expects integer mask operand in llvm >= 3.8
437 #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
438 mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
439 #else
440 mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
441 #endif
442 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
443 vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
444 }
445 return vResult;
446 }
447
448 //////////////////////////////////////////////////////////////////////////
449 /// @brief insert a JIT call to CallPrint
450 /// - outputs formatted string to both stdout and VS output window
451 /// - DEBUG builds only
452 /// Usage example:
453 /// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
454 /// where C(lane) creates a constant value to print, and pIndex is the Value*
455 /// result from a GEP, printing out the pointer to memory
456 /// @param printStr - constant string to print, which includes format specifiers
457 /// @param printArgs - initializer list of Value*'s to print to std out
458 CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
459 {
460 // push the arguments to CallPrint into a vector
461 std::vector<Value*> printCallArgs;
462 // save room for the format string. we still need to modify it for vectors
463 printCallArgs.resize(1);
464
465 // search through the format string for special processing
466 size_t pos = 0;
467 std::string tempStr(printStr);
468 pos = tempStr.find('%', pos);
469 auto v = printArgs.begin();
470
471 while ((pos != std::string::npos) && (v != printArgs.end()))
472 {
473 Value* pArg = *v;
474 Type* pType = pArg->getType();
475
476 if (pType->isVectorTy())
477 {
478 Type* pContainedType = pType->getContainedType(0);
479
480 if (toupper(tempStr[pos + 1]) == 'X')
481 {
482 tempStr[pos] = '0';
483 tempStr[pos + 1] = 'x';
484 tempStr.insert(pos + 2, "%08X ");
485 pos += 7;
486
487 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
488
489 std::string vectorFormatStr;
490 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
491 {
492 vectorFormatStr += "0x%08X ";
493 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
494 }
495
496 tempStr.insert(pos, vectorFormatStr);
497 pos += vectorFormatStr.size();
498 }
499 else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
500 {
501 uint32_t i = 0;
502 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
503 {
504 tempStr.insert(pos, std::string("%f "));
505 pos += 3;
506 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
507 }
508 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
509 }
510 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
511 {
512 uint32_t i = 0;
513 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
514 {
515 tempStr.insert(pos, std::string("%d "));
516 pos += 3;
517 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
518 }
519 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
520 }
521 }
522 else
523 {
524 if (toupper(tempStr[pos + 1]) == 'X')
525 {
526 tempStr[pos] = '0';
527 tempStr.insert(pos + 1, "x%08");
528 printCallArgs.push_back(pArg);
529 pos += 3;
530 }
531 // for %f we need to cast float Values to doubles so that they print out correctly
532 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
533 {
534 printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
535 pos++;
536 }
537 else
538 {
539 printCallArgs.push_back(pArg);
540 }
541 }
542
543 // advance to the next argument
544 v++;
545 pos = tempStr.find('%', ++pos);
546 }
547
548 // create global variable constant string
549 Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
550 GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
551 JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
552
553 // get a pointer to the first character in the constant string array
554 std::vector<Constant*> geplist{C(0),C(0)};
555 Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
556
557 // insert the pointer to the format string in the argument vector
558 printCallArgs[0] = strGEP;
559
560 // get pointer to CallPrint function and insert decl into the module if needed
561 std::vector<Type*> args;
562 args.push_back(PointerType::get(mInt8Ty,0));
563 FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
564 Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
565
566 // if we haven't yet added the symbol to the symbol table
567 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
568 {
569 sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
570 }
571
572 // insert a call to CallPrint
573 return CALLA(callPrintFn,printCallArgs);
574 }
575
576 //////////////////////////////////////////////////////////////////////////
577 /// @brief Convenience wrapper around PRINT for format strings with no arguments.
578 CallInst* Builder::PRINT(const std::string &printStr)
579 {
580 return PRINT(printStr, {});
581 }
582
583 //////////////////////////////////////////////////////////////////////////
584 /// @brief Generate a masked gather operation in LLVM IR. If not
585 /// supported on the underlying platform, emulate it with loads
586 /// @param vSrc - SIMD wide value that is returned for lanes where the mask is not set
587 /// @param pBase - Int8* base VB address pointer value
588 /// @param vIndices - SIMD wide value of VB byte offsets
589 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
590 /// @param scale - value to scale indices by
591 Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
592 {
593 Value *vGather;
594
595 // use avx2 gather instruction if available
596 if(JM()->mArch.AVX2())
597 {
598 // force mask to <N x float>, required by vgather
599 Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);
600
601 vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale));
602 }
603 else
604 {
605 Value* pStack = STACKSAVE();
606
607 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
608 Value* vSrcPtr = ALLOCA(vSrc->getType());
609 STORE(vSrc, vSrcPtr);
610
611 vGather = VUNDEF_F();
612 Value *vScaleVec = VIMMED1((uint32_t)scale);
613 Value *vOffsets = MUL(vIndices,vScaleVec);
614 for(uint32_t i = 0; i < mVWidth; ++i)
615 {
616 // single component byte index
617 Value *offset = VEXTRACT(vOffsets,C(i));
618 // byte pointer to component
619 Value *loadAddress = GEP(pBase,offset);
620 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
621 // pointer to the value to load if we're masking off a component
622 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
623 Value *selMask = VEXTRACT(vMask,C(i));
624 // switch in a safe (stack) address to load from when this lane is masked off
625 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
626 Value *val = LOAD(validAddress);
627 vGather = VINSERT(vGather,val,C(i));
628 }
629
630 STACKRESTORE(pStack);
631 }
632
633 return vGather;
634 }
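// Illustrative usage sketch for GATHERPS; the value names below are hypothetical
// and only show the calling pattern used elsewhere in this file:
//
//     Value* vDefault  = VIMMED1(0.0f);   // per-lane fallback for masked-off lanes
//     Value* vGathered = GATHERPS(vDefault, pVertexBuffer, vByteOffsets, vLaneMask);
//
// On AVX2 this lowers to a single vgatherps; otherwise the loop above loads each
// active lane individually and reads the fallback from the stack copy of vSrc.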
635
636 Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
637 {
638 Value *vGather = VUNDEF_F_16();
639
640 // use AVX512F gather instruction if available
641 if (JM()->mArch.AVX512F())
642 {
643 // force mask to <N-bit Integer>, required by vgather2
644 Value *mask = BITCAST(vMask, mInt16Ty);
645
646 vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
647 }
648 else
649 {
650 Value *src0 = EXTRACT_16(vSrc, 0);
651 Value *src1 = EXTRACT_16(vSrc, 1);
652
653 Value *indices0 = EXTRACT_16(vIndices, 0);
654 Value *indices1 = EXTRACT_16(vIndices, 1);
655
656 Value *mask0 = EXTRACT_16(vMask, 0);
657 Value *mask1 = EXTRACT_16(vMask, 1);
658
659 Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
660 Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
661
662 vGather = JOIN_16(gather0, gather1);
663 }
664
665 return vGather;
666 }
667
668 //////////////////////////////////////////////////////////////////////////
669 /// @brief Generate a masked gather operation in LLVM IR. If not
670 /// supported on the underlying platform, emulate it with loads
671 /// @param vSrc - SIMD wide value that is returned for lanes where the mask is not set
672 /// @param pBase - Int8* base VB address pointer value
673 /// @param vIndices - SIMD wide value of VB byte offsets
674 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
675 /// @param scale - value to scale indices by
676 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
677 {
678 Value* vGather;
679
680 // use avx2 gather instruction if available
681 if(JM()->mArch.AVX2())
682 {
683 vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
684 }
685 else
686 {
687 Value* pStack = STACKSAVE();
688
689 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
690 Value* vSrcPtr = ALLOCA(vSrc->getType());
691 STORE(vSrc, vSrcPtr);
692
693 vGather = VUNDEF_I();
694 Value *vScaleVec = VIMMED1((uint32_t)scale);
695 Value *vOffsets = MUL(vIndices, vScaleVec);
696 for(uint32_t i = 0; i < mVWidth; ++i)
697 {
698 // single component byte index
699 Value *offset = VEXTRACT(vOffsets, C(i));
700 // byte pointer to component
701 Value *loadAddress = GEP(pBase, offset);
702 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
703 // pointer to the value to load if we're masking off a component
704 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
705 Value *selMask = VEXTRACT(vMask, C(i));
706 // switch in a safe (stack) address to load from when this lane is masked off
707 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
708 Value *val = LOAD(validAddress, C(0));
709 vGather = VINSERT(vGather, val, C(i));
710 }
711
712 STACKRESTORE(pStack);
713 }
714
715 return vGather;
716 }
717
718 Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
719 {
720 Value *vGather = VUNDEF_I_16();
721
722 // use AVX512F gather instruction if available
723 if (JM()->mArch.AVX512F())
724 {
725 // force mask to <N-bit Integer>, required by vgather2
726 Value *mask = BITCAST(vMask, mInt16Ty);
727
728 vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
729 }
730 else
731 {
732 Value *src0 = EXTRACT_16(vSrc, 0);
733 Value *src1 = EXTRACT_16(vSrc, 1);
734
735 Value *indices0 = EXTRACT_16(vIndices, 0);
736 Value *indices1 = EXTRACT_16(vIndices, 1);
737
738 Value *mask0 = EXTRACT_16(vMask, 0);
739 Value *mask1 = EXTRACT_16(vMask, 1);
740
741 Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
742 Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);
743
744 vGather = JOIN_16(gather0, gather1);
745 }
746
747 return vGather;
748 }
749
750 //////////////////////////////////////////////////////////////////////////
751 /// @brief Generate a masked gather operation in LLVM IR. If not
752 /// supported on the underlying platform, emulate it with loads
753 /// @param vSrc - SIMD wide value that is returned for lanes where the mask is not set
754 /// @param pBase - Int8* base VB address pointer value
755 /// @param vIndices - SIMD wide value of VB byte offsets
756 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
757 /// @param scale - value to scale indices by
758 Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
759 {
760 Value* vGather;
761
762 // use avx2 gather instruction if available
763 if(JM()->mArch.AVX2())
764 {
765 vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth/2)), VectorType::get(mDoubleTy, mVWidth/2));
766 vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
767 }
768 else
769 {
770 Value* pStack = STACKSAVE();
771
772 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
773 Value* vSrcPtr = ALLOCA(vSrc->getType());
774 STORE(vSrc, vSrcPtr);
775
776 vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
777 Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
778 Value *vOffsets = MUL(vIndices,vScaleVec);
779 for(uint32_t i = 0; i < mVWidth/2; ++i)
780 {
781 // single component byte index
782 Value *offset = VEXTRACT(vOffsets,C(i));
783 // byte pointer to component
784 Value *loadAddress = GEP(pBase,offset);
785 loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
786 // pointer to the value to load if we're masking off a component
787 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
788 Value *selMask = VEXTRACT(vMask,C(i));
789 // switch in a safe (stack) address to load from when this lane is masked off
790 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
791 Value *val = LOAD(validAddress);
792 vGather = VINSERT(vGather,val,C(i));
793 }
794 STACKRESTORE(pStack);
795 }
796 return vGather;
797 }
798
799 Value *Builder::EXTRACT_16(Value *x, uint32_t imm)
800 {
801 if (imm == 0)
802 {
803 return VSHUFFLE(x, UndefValue::get(x->getType()), { 0, 1, 2, 3, 4, 5, 6, 7 });
804 }
805 else
806 {
807 return VSHUFFLE(x, UndefValue::get(x->getType()), { 8, 9, 10, 11, 12, 13, 14, 15 });
808 }
809 }
810
811 Value *Builder::JOIN_16(Value *a, Value *b)
812 {
813 return VSHUFFLE(a, b, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
814 }
815
816 //////////////////////////////////////////////////////////////////////////
817 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
818 Value *Builder::MASK(Value *vmask)
819 {
820 Value *src = BITCAST(vmask, mSimdInt32Ty);
821 return ICMP_SLT(src, VIMMED1(0));
822 }
823
824 Value *Builder::MASK_16(Value *vmask)
825 {
826 Value *src = BITCAST(vmask, mSimd16Int32Ty);
827 return ICMP_SLT(src, VIMMED1_16(0));
828 }
829
830 //////////////////////////////////////////////////////////////////////////
831 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
832 Value *Builder::VMASK(Value *mask)
833 {
834 return S_EXT(mask, mSimdInt32Ty);
835 }
836
837 Value *Builder::VMASK_16(Value *mask)
838 {
839 return S_EXT(mask, mSimd16Int32Ty);
840 }
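// Round-trip example for the two mask representations: an active x86-style lane
// holds 0xFFFFFFFF (sign bit set), so the ICMP_SLT in MASK() yields i1 true, and
// the S_EXT in VMASK() widens i1 true back to 0xFFFFFFFF; an inactive lane
// (0x00000000) maps to i1 false and back to 0x00000000.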
841
842 //////////////////////////////////////////////////////////////////////////
843 /// @brief Generate a VPSHUFB operation in LLVM IR. If not
844 /// supported on the underlying platform, emulate it
845 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
846 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
847 /// Byte masks in the lower 128-bit lane of b select 8-bit values from the
848 /// lower 128 bits of a, and likewise for the upper lane. If the mask
849 /// value is negative, '0' is inserted.
850 Value *Builder::PSHUFB(Value* a, Value* b)
851 {
852 Value* res;
853 // use avx2 pshufb instruction if available
854 if(JM()->mArch.AVX2())
855 {
856 res = VPSHUFB(a, b);
857 }
858 else
859 {
860 Constant* cB = dyn_cast<Constant>(b);
861 // number of 8 bit elements in b
862 uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
863 // output vector
864 Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
865
866 // insert an 8 bit value from the high and low lanes of a per loop iteration
867 numElms /= 2;
868 for(uint32_t i = 0; i < numElms; i++)
869 {
870 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
871 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
872
873 // extract values from constant mask
874 char valLow128bLane = (char)(cLow128b->getSExtValue());
875 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
876
877 Value* insertValLow128b;
878 Value* insertValHigh128b;
879
880 // if the mask value is negative, insert a '0' in the respective output position
881 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
882 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
883 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
884
885 vShuf = VINSERT(vShuf, insertValLow128b, i);
886 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
887 }
888 res = vShuf;
889 }
890 return res;
891 }
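// Example of the byte-select semantics implemented above: within the lower
// 128-bit lane, a mask byte of 0x02 picks byte 2 of a's lower lane for that
// output position, while any mask byte with its sign bit set (e.g. 0x80) writes
// 0 instead; the upper lane behaves the same way using only a's upper lane.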
892
893 //////////////////////////////////////////////////////////////////////////
894 /// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32
895 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
896 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
897 /// lower 8 values are used.
898 Value *Builder::PMOVSXBD(Value* a)
899 {
900 // VPMOVSXBD output type
901 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
902 // Extract 8 values from 128bit lane and sign extend
903 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
904 }
905
906 //////////////////////////////////////////////////////////////////////////
907 /// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32
908 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
909 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
910 Value *Builder::PMOVSXWD(Value* a)
911 {
912 // VPMOVSXWD output type
913 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
914 // Extract 8 values from 128bit lane and sign extend
915 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
916 }
917
918 //////////////////////////////////////////////////////////////////////////
919 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
920 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
921 /// platform, emulate it
922 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
923 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
924 Value *Builder::PERMD(Value* a, Value* idx)
925 {
926 Value* res;
927 // use avx2 permute instruction if available
928 if(JM()->mArch.AVX2())
929 {
930 res = VPERMD(a, idx);
931 }
932 else
933 {
934 if (isa<Constant>(idx))
935 {
936 res = VSHUFFLE(a, a, idx);
937 }
938 else
939 {
940 res = VUNDEF_I();
941 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
942 {
943 Value* pIndex = VEXTRACT(idx, C(l));
944 Value* pVal = VEXTRACT(a, pIndex);
945 res = VINSERT(res, pVal, C(l));
946 }
947 }
948 }
949 return res;
950 }
951
952 //////////////////////////////////////////////////////////////////////////
953 /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
954 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
955 /// platform, emulate it
956 /// @param a - 256bit SIMD lane(8x32bit) of float values.
957 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
958 Value *Builder::PERMPS(Value* a, Value* idx)
959 {
960 Value* res;
961 // use avx2 permute instruction if available
962 if (JM()->mArch.AVX2())
963 {
964 // llvm 3.6.0 swapped the order of the args to vpermd
965 res = VPERMPS(idx, a);
966 }
967 else
968 {
969 if (isa<Constant>(idx))
970 {
971 res = VSHUFFLE(a, a, idx);
972 }
973 else
974 {
975 res = VUNDEF_F();
976 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
977 {
978 Value* pIndex = VEXTRACT(idx, C(l));
979 Value* pVal = VEXTRACT(a, pIndex);
980 res = VINSERT(res, pVal, C(l));
981 }
982 }
983 }
984
985 return res;
986 }
987
988 //////////////////////////////////////////////////////////////////////////
989 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
990 /// in LLVM IR. If not supported on the underlying platform, emulate it
991 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
992 Value *Builder::CVTPH2PS(Value* a)
993 {
994 if (JM()->mArch.F16C())
995 {
996 return VCVTPH2PS(a);
997 }
998 else
999 {
1000 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
1001 Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy));
1002
1003 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == nullptr)
1004 {
1005 sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", (void *)&ConvertFloat16ToFloat32);
1006 }
1007
1008 Value* pResult = UndefValue::get(mSimdFP32Ty);
1009 for (uint32_t i = 0; i < mVWidth; ++i)
1010 {
1011 Value* pSrc = VEXTRACT(a, C(i));
1012 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
1013 pResult = VINSERT(pResult, pConv, C(i));
1014 }
1015
1016 return pResult;
1017 }
1018 }
1019
1020 //////////////////////////////////////////////////////////////////////////
1021 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
1022 /// in LLVM IR. If not supported on the underlying platform, emulate it
1023 /// @param a - 256bit SIMD (8x32bit) of float32 values to convert.
1024 Value *Builder::CVTPS2PH(Value* a, Value* rounding)
1025 {
1026 if (JM()->mArch.F16C())
1027 {
1028 return VCVTPS2PH(a, rounding);
1029 }
1030 else
1031 {
1032 // call scalar C function for now
1033 FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
1034 Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy));
1035
1036 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr)
1037 {
1038 sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", (void *)&ConvertFloat32ToFloat16);
1039 }
1040
1041 Value* pResult = UndefValue::get(mSimdInt16Ty);
1042 for (uint32_t i = 0; i < mVWidth; ++i)
1043 {
1044 Value* pSrc = VEXTRACT(a, C(i));
1045 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
1046 pResult = VINSERT(pResult, pConv, C(i));
1047 }
1048
1049 return pResult;
1050 }
1051 }
1052
1053 Value *Builder::PMAXSD(Value* a, Value* b)
1054 {
1055 Value* cmp = ICMP_SGT(a, b);
1056 return SELECT(cmp, a, b);
1057 }
1058
1059 Value *Builder::PMINSD(Value* a, Value* b)
1060 {
1061 Value* cmp = ICMP_SLT(a, b);
1062 return SELECT(cmp, a, b);
1063 }
1064
1065 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
1066 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1067 {
1068 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
1069 if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
1070 {
1071 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1072 }
1073 else
1074 {
1075 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1076 }
1077 }
1078
1079 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1080 Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
1081 {
1082 switch(info.bpp / info.numComps)
1083 {
1084 case 16:
1085 {
1086 Value* vGatherResult[2];
1087
1088 // TODO: vGatherMaskedVal
1089 Value* vGatherMaskedVal = VIMMED1((float)0);
1090
1091 // always have at least one component out of x or y to fetch
1092
1093 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1094 // e.g. result of first 8x32bit integer gather for 16bit components
1095 // 256i - 0 1 2 3 4 5 6 7
1096 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1097 //
1098
1099 // if we have at least one component out of z or w to fetch
1100 if(info.numComps > 2)
1101 {
1102 // offset base to the next components(zw) in the vertex to gather
1103 pSrcBase = GEP(pSrcBase, C((char)4));
1104
1105 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1106 // e.g. result of second 8x32bit integer gather for 16bit components
1107 // 256i - 0 1 2 3 4 5 6 7
1108 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1109 //
1110 }
1111 else
1112 {
1113 vGatherResult[1] = vGatherMaskedVal;
1114 }
1115
1116 // Shuffle gathered components into place, each row is a component
1117 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1118 }
1119 break;
1120 case 32:
1121 {
1122 // apply defaults
1123 for (uint32_t i = 0; i < 4; ++i)
1124 {
1125 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1126 }
1127
1128 for(uint32_t i = 0; i < info.numComps; i++)
1129 {
1130 uint32_t swizzleIndex = info.swizzle[i];
1131
1132 // Gather a SIMD of components
1133 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
1134
1135 // offset base to the next component to gather
1136 pSrcBase = GEP(pSrcBase, C((char)4));
1137 }
1138 }
1139 break;
1140 default:
1141 SWR_INVALID("Invalid float format");
1142 break;
1143 }
1144 }
1145
1146 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1147 Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
1148 {
1149 switch (info.bpp / info.numComps)
1150 {
1151 case 8:
1152 {
1153 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1154 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1155 // e.g. result of an 8x32bit integer gather for 8bit components
1156 // 256i - 0 1 2 3 4 5 6 7
1157 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1158
1159 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1160 }
1161 break;
1162 case 16:
1163 {
1164 Value* vGatherResult[2];
1165
1166 // TODO: vGatherMaskedVal
1167 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1168
1169 // always have at least one component out of x or y to fetch
1170
1171 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1172 // e.g. result of first 8x32bit integer gather for 16bit components
1173 // 256i - 0 1 2 3 4 5 6 7
1174 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1175 //
1176
1177 // if we have at least one component out of z or w to fetch
1178 if(info.numComps > 2)
1179 {
1180 // offset base to the next components(zw) in the vertex to gather
1181 pSrcBase = GEP(pSrcBase, C((char)4));
1182
1183 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1184 // e.g. result of second 8x32bit integer gather for 16bit components
1185 // 256i - 0 1 2 3 4 5 6 7
1186 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1187 //
1188 }
1189 else
1190 {
1191 vGatherResult[1] = vGatherMaskedVal;
1192 }
1193
1194 // Shuffle gathered components into place, each row is a component
1195 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1196
1197 }
1198 break;
1199 case 32:
1200 {
1201 // apply defaults
1202 for (uint32_t i = 0; i < 4; ++i)
1203 {
1204 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1205 }
1206
1207 for(uint32_t i = 0; i < info.numComps; i++)
1208 {
1209 uint32_t swizzleIndex = info.swizzle[i];
1210
1211 // Gather a SIMD of components
1212 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
1213
1214 // offset base to the next component to gather
1215 pSrcBase = GEP(pSrcBase, C((char)4));
1216 }
1217 }
1218 break;
1219 default:
1220 SWR_INVALID("unsupported format");
1221 break;
1222 }
1223 }
1224
1225 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1226 {
1227 // cast types
1228 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1229 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1230
1231 // input could either be float or int vector; do shuffle work in int
1232 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1233 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1234
1235 if(bPackedOutput)
1236 {
1237 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1238
1239 // shuffle mask
1240 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1241 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1242 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1243 // after pshufb: group components together in each 128bit lane
1244 // 256i - 0 1 2 3 4 5 6 7
1245 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1246
1247 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1248 // after PERMD: move and pack xy components into each 128bit lane
1249 // 256i - 0 1 2 3 4 5 6 7
1250 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1251
1252 // do the same for zw components
1253 Value* vi128ZW = nullptr;
1254 if(info.numComps > 2)
1255 {
1256 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1257 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1258 }
1259
1260 for(uint32_t i = 0; i < 4; i++)
1261 {
1262 uint32_t swizzleIndex = info.swizzle[i];
1263 // todo: fix for packed
1264 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1265 if(i >= info.numComps)
1266 {
1267 // set the default component val
1268 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1269 continue;
1270 }
1271
1272 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1273 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1274 // if x or y, use vi128XY permute result, else use vi128ZW
1275 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1276
1277 // extract packed component 128 bit lanes
1278 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1279 }
1280
1281 }
1282 else
1283 {
1284 // pshufb masks for each component
1285 Value* vConstMask[2];
1286 // x/z shuffle mask
1287 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1288 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1289
1290 // y/w shuffle mask
1291 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1292 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1293
1294
1295 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1296 // apply defaults
1297 for (uint32_t i = 0; i < 4; ++i)
1298 {
1299 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1300 }
1301
1302 for(uint32_t i = 0; i < info.numComps; i++)
1303 {
1304 uint32_t swizzleIndex = info.swizzle[i];
1305
1306 // select correct constMask for x/z or y/w pshufb
1307 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1308 // if x or y, use vi128XY permute result, else use vi128ZW
1309 uint32_t selectedGather = (i < 2) ? 0 : 1;
1310
1311 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1312 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1313 // 256i - 0 1 2 3 4 5 6 7
1314 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1315 }
1316 }
1317 }
1318
1319 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1320 {
1321 // cast types
1322 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1323 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1324
1325 if(bPackedOutput)
1326 {
1327 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1328 // shuffle mask
1329 Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1330 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1331 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1332 // after pshufb: group components together in each 128bit lane
1333 // 256i - 0 1 2 3 4 5 6 7
1334 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1335
1336 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1337 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1338 // 256i - 0 1 2 3 4 5 6 7
1339 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1340
1341 // do the same for zw components
1342 Value* vi128ZW = nullptr;
1343 if(info.numComps > 2)
1344 {
1345 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1346 }
1347
1348 // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
1349 for(uint32_t i = 0; i < 4; i++)
1350 {
1351 uint32_t swizzleIndex = info.swizzle[i];
1352 // todo: fix for packed
1353 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1354 if(i >= info.numComps)
1355 {
1356 // set the default component val
1357 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1358 continue;
1359 }
1360
1361 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1362 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1363 // if x or y, use vi128XY permute result, else use vi128ZW
1364 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1365
1366 // sign extend
1367 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1368 }
1369 }
1370 // else zero extend
1371 else{
1372 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1373 // apply defaults
1374 for (uint32_t i = 0; i < 4; ++i)
1375 {
1376 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1377 }
1378
1379 for(uint32_t i = 0; i < info.numComps; i++){
1380 uint32_t swizzleIndex = info.swizzle[i];
1381
1382 // pshufb masks for each component
1383 Value* vConstMask;
1384 switch(i)
1385 {
1386 case 0:
1387 // x shuffle mask
1388 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1389 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1390 break;
1391 case 1:
1392 // y shuffle mask
1393 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1394 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1395 break;
1396 case 2:
1397 // z shuffle mask
1398 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1399 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1400 break;
1401 case 3:
1402 // w shuffle mask
1403 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1404 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1405 break;
1406 default:
1407 vConstMask = nullptr;
1408 break;
1409 }
1410
1411 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1412 // after pshufb for x channel
1413 // 256i - 0 1 2 3 4 5 6 7
1414 // x000 x000 x000 x000 x000 x000 x000 x000
1415 }
1416 }
1417 }
1418
1419 // Helper function to create alloca in entry block of function
1420 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
1421 {
1422 auto saveIP = IRB()->saveIP();
1423 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1424 pFunc->getEntryBlock().begin());
1425 Value* pAlloca = ALLOCA(pType);
1426 if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1427 return pAlloca;
1428 }
1429
1430 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
1431 {
1432 auto saveIP = IRB()->saveIP();
1433 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1434 pFunc->getEntryBlock().begin());
1435 Value* pAlloca = ALLOCA(pType, pArraySize);
1436 if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1437 return pAlloca;
1438 }
1439
1440 //////////////////////////////////////////////////////////////////////////
1441 /// @brief emulates a scatter operation.
1442 /// @param pDst - pointer to destination
1443 /// @param vSrc - vector of src data to scatter
1444 /// @param vOffsets - vector of byte offsets from pDst
1445 /// @param vMask - mask of valid lanes
1446 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1447 {
1448 /* Scatter algorithm
1449 
1450 while (Index = BitScanForward(mask))
1451 srcElem = srcVector[Index]
1452 offsetElem = offsetVector[Index]
1453 *(pDst + offsetElem) = srcElem
1454 mask &= ~(1 << Index)
1455 
1456 */
1457
1458 BasicBlock* pCurBB = IRB()->GetInsertBlock();
1459 Function* pFunc = pCurBB->getParent();
1460 Type* pSrcTy = vSrc->getType()->getVectorElementType();
1461
1462 // Store vectors on stack
1463 if (pScatterStackSrc == nullptr)
1464 {
1465 // Save off stack allocations and reuse per scatter. Significantly reduces stack
1466 // requirements for shaders with a lot of scatters.
1467 pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
1468 pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
1469 }
1470
1471 Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
1472 Value* pOffsetsArrayPtr = pScatterStackOffsets;
1473 STORE(vSrc, pSrcArrayPtr);
1474 STORE(vOffsets, pOffsetsArrayPtr);
1475
1476 // Cast to pointers for random access
1477 pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
1478 pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
1479
1480 Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
1481
1482 // Get cttz function
1483 Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
1484
1485 // Setup loop basic block
1486 BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc);
1487
1488 // compute first set bit
1489 Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
1490
1491 Value* pIsUndef = ICMP_EQ(pIndex, C(32));
1492
1493 // Split current block
1494 BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
1495
1496 // Remove unconditional jump created by splitBasicBlock
1497 pCurBB->getTerminator()->eraseFromParent();
1498
1499 // Add terminator to end of original block
1500 IRB()->SetInsertPoint(pCurBB);
1501
1502 // Add conditional branch
1503 COND_BR(pIsUndef, pPostLoop, pLoop);
1504
1505 // Add loop basic block contents
1506 IRB()->SetInsertPoint(pLoop);
1507 PHINode* pIndexPhi = PHI(mInt32Ty, 2);
1508 PHINode* pMaskPhi = PHI(mInt32Ty, 2);
1509
1510 pIndexPhi->addIncoming(pIndex, pCurBB);
1511 pMaskPhi->addIncoming(pMask, pCurBB);
1512
1513 // Extract elements for this index
1514 Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
1515 Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
1516
1517 // GEP to this offset in dst
1518 Value* pCurDst = GEP(pDst, pOffsetElem);
1519 pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
1520 STORE(pSrcElem, pCurDst);
1521
1522 // Update the mask
1523 Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
1524
1525 // Terminator
1526 Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
1527
1528 pIsUndef = ICMP_EQ(pNewIndex, C(32));
1529 COND_BR(pIsUndef, pPostLoop, pLoop);
1530
1531 // Update phi edges
1532 pIndexPhi->addIncoming(pNewIndex, pLoop);
1533 pMaskPhi->addIncoming(pNewMask, pLoop);
1534
1535 // Move builder to beginning of post loop
1536 IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
1537 }
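// Example walk of the scatter loop above for a lane mask of 0b101: cttz(0b101)
// is 0, so lane 0 is stored to pDst plus the lane 0 byte offset and the mask
// becomes 0b100; cttz(0b100) is 2, so lane 2 is stored and the mask becomes 0;
// cttz(0) with the is_zero_undef flag false returns 32, which satisfies
// ICMP_EQ(pIndex, C(32)) and control branches to pPostLoop.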
1538
1539 Value* Builder::VABSPS(Value* a)
1540 {
1541 Value* asInt = BITCAST(a, mSimdInt32Ty);
1542 Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1543 return result;
1544 }
1545
1546 Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
1547 {
1548 Value *lowCmp = ICMP_SLT(src, low);
1549 Value *ret = SELECT(lowCmp, low, src);
1550
1551 Value *highCmp = ICMP_SGT(ret, high);
1552 ret = SELECT(highCmp, high, ret);
1553
1554 return ret;
1555 }
1556
1557 Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1558 {
1559 Value *lowCmp = FCMP_OLT(src, low);
1560 Value *ret = SELECT(lowCmp, low, src);
1561
1562 Value *highCmp = FCMP_OGT(ret, high);
1563 ret = SELECT(highCmp, high, ret);
1564
1565 return ret;
1566 }
1567
1568 Value *Builder::FCLAMP(Value* src, float low, float high)
1569 {
1570 Value* result = VMAXPS(src, VIMMED1(low));
1571 result = VMINPS(result, VIMMED1(high));
1572
1573 return result;
1574 }
1575
1576 //////////////////////////////////////////////////////////////////////////
1577 /// @brief save/restore stack, providing ability to push/pop the stack and
1578 /// reduce overall stack requirements for temporary stack use
1579 Value* Builder::STACKSAVE()
1580 {
1581 Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1582 return CALLA(pfnStackSave);
1583 }
1584
1585 void Builder::STACKRESTORE(Value* pSaved)
1586 {
1587 Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1588 CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1589 }
1590
1591 Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1592 {
1593 Value* vOut;
1594 // use FMADs if available
1595 if(JM()->mArch.AVX2())
1596 {
1597 vOut = VFMADDPS(a, b, c);
1598 }
1599 else
1600 {
1601 vOut = FADD(FMUL(a, b), c);
1602 }
1603 return vOut;
1604 }
1605
1606 Value* Builder::POPCNT(Value* a)
1607 {
1608 Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1609 return CALL(pCtPop, std::initializer_list<Value*>{a});
1610 }
1611
1612 //////////////////////////////////////////////////////////////////////////
1613 /// @brief C functions called by LLVM IR
1614 //////////////////////////////////////////////////////////////////////////
1615
1616 //////////////////////////////////////////////////////////////////////////
1617 /// @brief called in JIT code, inserted by PRINT
1618 /// output to both stdout and visual studio debug console
1619 void __cdecl CallPrint(const char* fmt, ...)
1620 {
1621 va_list args;
1622 va_start(args, fmt);
1623 vprintf(fmt, args);
1624
1625 #if defined( _WIN32 )
1626 char strBuf[1024];
1627 vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1628 OutputDebugStringA(strBuf);
1629 #endif
1630
1631 va_end(args);
1632 }
1633
1634 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1635 {
1636 bool flag = !imm8->isZeroValue();
1637 SmallVector<Constant*,8> idx;
1638 for (unsigned i = 0; i < mVWidth / 2; i++) {
1639 idx.push_back(C(flag ? i + mVWidth / 2 : i));
1640 }
1641 return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1642 }
1643
1644 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1645 {
1646 bool flag = !imm8->isZeroValue();
1647 SmallVector<Constant*,8> idx;
1648 for (unsigned i = 0; i < mVWidth; i++) {
1649 idx.push_back(C(i));
1650 }
1651 Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1652
1653 SmallVector<Constant*,8> idx2;
1654 for (unsigned i = 0; i < mVWidth / 2; i++) {
1655 idx2.push_back(C(flag ? i : i + mVWidth));
1656 }
1657 for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1658 idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1659 }
1660 return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1661 }
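// Example of the shuffle-based 128-bit lane helpers above for an 8-wide SIMD:
// VEXTRACTI128(a, C(1)) returns lanes 4..7 of a, and VINSERTI128(a, b, C(0))
// returns lanes 0..3 of b followed by lanes 4..7 of a, mirroring the immediate
// semantics of the AVX2 vextracti128/vinserti128 instructions.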
1662
1663 // rdtsc buckets macros
1664 void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1665 {
1666 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1667 // buckets framework when single threaded
1668 if (KNOB_SINGLE_THREADED)
1669 {
1670 std::vector<Type*> args{
1671 PointerType::get(mInt32Ty, 0), // pBucketMgr
1672 mInt32Ty // id
1673 };
1674
1675 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1676 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1677 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1678 {
1679 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1680 }
1681
1682 CALL(pFunc, { pBucketMgr, pId });
1683 }
1684 }
1685
1686 void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1687 {
1688 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1689 // buckets framework when single threaded
1690 if (KNOB_SINGLE_THREADED)
1691 {
1692 std::vector<Type*> args{
1693 PointerType::get(mInt32Ty, 0), // pBucketMgr
1694 mInt32Ty // id
1695 };
1696
1697 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1698 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1699 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1700 {
1701 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1702 }
1703
1704 CALL(pFunc, { pBucketMgr, pId });
1705 }
1706 }
1707
1708
1709 uint32_t Builder::GetTypeSize(Type* pType)
1710 {
1711 if (pType->isStructTy())
1712 {
1713 uint32_t numElems = pType->getStructNumElements();
1714 Type* pElemTy = pType->getStructElementType(0);
1715 return numElems * GetTypeSize(pElemTy);
1716 }
1717
1718 if (pType->isArrayTy())
1719 {
1720 uint32_t numElems = pType->getArrayNumElements();
1721 Type* pElemTy = pType->getArrayElementType();
1722 return numElems * GetTypeSize(pElemTy);
1723 }
1724
1725 if (pType->isIntegerTy())
1726 {
1727 uint32_t bitSize = pType->getIntegerBitWidth();
1728 return bitSize / 8;
1729 }
1730
1731 if (pType->isFloatTy())
1732 {
1733 return 4;
1734 }
1735
1736 if (pType->isHalfTy())
1737 {
1738 return 2;
1739 }
1740
1741 if (pType->isDoubleTy())
1742 {
1743 return 8;
1744 }
1745
1746 SWR_ASSERT(false, "Unimplemented type.");
1747 return 0;
1748 }
1749 }