1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder.h"
32 #include "common/rdtsc_buckets.h"
33
34 #include <cstdarg>
35
36 namespace SwrJit
37 {
38 void __cdecl CallPrint(const char* fmt, ...);
39
40 //////////////////////////////////////////////////////////////////////////
41 /// @brief Convert an IEEE 754 32-bit single precision float to a
42 /// 16-bit float with 5 exponent bits and 10 mantissa bits
43 /// (IEEE 754 binary16).
44 /// @param val - 32-bit float
45 /// @todo Maybe move this outside of this file into a header?
46 static uint16_t ConvertFloat32ToFloat16(float val)
47 {
48 uint32_t sign, exp, mant;
49 uint32_t roundBits;
50
51 // Extract the sign, exponent, and mantissa
52 uint32_t uf = *(uint32_t*)&val;
53 sign = (uf & 0x80000000) >> 31;
54 exp = (uf & 0x7F800000) >> 23;
55 mant = uf & 0x007FFFFF;
56
57 // Check for out of range
58 if (std::isnan(val))
59 {
60 exp = 0x1F;
61 mant = 0x200;
62 sign = 1; // set the sign bit for NANs
63 }
64 else if (std::isinf(val))
65 {
66 exp = 0x1f;
67 mant = 0x0;
68 }
69 else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
70 {
71 exp = 0x1E;
72 mant = 0x3FF;
73 }
74 else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
75 {
76 mant |= 0x00800000;
77 for (; exp <= 0x70; mant >>= 1, exp++)
78 ;
79 exp = 0;
80 mant = mant >> 13;
81 }
82 else if (exp < 0x66) // Too small to represent -> Zero
83 {
84 exp = 0;
85 mant = 0;
86 }
87 else
88 {
89 // Saves bits that will be shifted off for rounding
90 roundBits = mant & 0x1FFFu;
91 // convert exponent and mantissa to 16 bit format
92 exp = exp - 0x70;
93 mant = mant >> 13;
94
95 // Essentially RTZ, but round up if off by only 1 lsb
96 if (roundBits == 0x1FFFu)
97 {
98 mant++;
99 // check for overflow
100 if ((mant & 0xC00u) != 0)
101 exp++;
102 // make sure only the needed bits are used
103 mant &= 0x3FF;
104 }
105 }
106
107 uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
108 return (uint16_t)tmpVal;
109 }
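// Worked example (illustrative only): 1.0f is 0x3F800000, i.e. sign=0,
// exp=127 (0x7F), mant=0. The in-range path computes exp - 0x70 = 0xF and
// mant >> 13 = 0, so the result is (0xF << 10) = 0x3C00, the binary16
// encoding of 1.0.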
110
111 //////////////////////////////////////////////////////////////////////////
112 /// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
113 /// float
114 /// @param val - 16-bit float
115 /// @todo Maybe move this outside of this file into a header?
116 static float ConvertFloat16ToFloat32(uint32_t val)
117 {
118 uint32_t result;
119 if ((val & 0x7fff) == 0)
120 {
121 result = ((uint32_t)(val & 0x8000)) << 16;
122 }
123 else if ((val & 0x7c00) == 0x7c00)
124 {
125 result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
126 result |= ((uint32_t)val & 0x8000) << 16;
127 }
128 else
129 {
130 uint32_t sign = (val & 0x8000) << 16;
131 uint32_t mant = (val & 0x3ff) << 13;
132 uint32_t exp = (val >> 10) & 0x1f;
133 if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
134 {
135 mant <<= 1;
136 while (mant < (0x400 << 13))
137 {
138 exp--;
139 mant <<= 1;
140 }
141 mant &= (0x3ff << 13);
142 }
143 exp = ((exp - 15 + 127) & 0xff) << 23;
144 result = sign | exp | mant;
145 }
146
147 return *(float*)&result;
148 }
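// Worked example (illustrative only): 0x3C00 (1.0 in binary16) has sign=0,
// exp=15, mant=0; the exponent is rebiased to (15 - 15 + 127) = 127, giving
// result bits 0x3F800000, i.e. 1.0f.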
149
150 Constant *Builder::C(bool i)
151 {
152 return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
153 }
154
155 Constant *Builder::C(char i)
156 {
157 return ConstantInt::get(IRB()->getInt8Ty(), i);
158 }
159
160 Constant *Builder::C(uint8_t i)
161 {
162 return ConstantInt::get(IRB()->getInt8Ty(), i);
163 }
164
165 Constant *Builder::C(int i)
166 {
167 return ConstantInt::get(IRB()->getInt32Ty(), i);
168 }
169
170 Constant *Builder::C(int64_t i)
171 {
172 return ConstantInt::get(IRB()->getInt64Ty(), i);
173 }
174
175 Constant *Builder::C(uint16_t i)
176 {
177 return ConstantInt::get(mInt16Ty, i);
178 }
179
180 Constant *Builder::C(uint32_t i)
181 {
182 return ConstantInt::get(IRB()->getInt32Ty(), i);
183 }
184
185 Constant *Builder::C(float i)
186 {
187 return ConstantFP::get(IRB()->getFloatTy(), i);
188 }
189
190 Constant *Builder::PRED(bool pred)
191 {
192 return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
193 }
194
195 Value *Builder::VIMMED1(int i)
196 {
197 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
198 }
199
200 Value *Builder::VIMMED1_16(int i)
201 {
202 return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
203 }
204
205 Value *Builder::VIMMED1(uint32_t i)
206 {
207 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
208 }
209
210 Value *Builder::VIMMED1_16(uint32_t i)
211 {
212 return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
213 }
214
215 Value *Builder::VIMMED1(float i)
216 {
217 return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
218 }
219
220 Value *Builder::VIMMED1_16(float i)
221 {
222 return ConstantVector::getSplat(mVWidth16, cast<ConstantFP>(C(i)));
223 }
224
225 Value *Builder::VIMMED1(bool i)
226 {
227 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
228 }
229
230 Value *Builder::VIMMED1_16(bool i)
231 {
232 return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
233 }
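// Usage note: C(x) wraps a scalar immediate as an LLVM Constant of the
// matching type, while VIMMED1/VIMMED1_16 splat that constant across a
// SIMD-width vector, e.g. VIMMED1(1.0f) is a <mVWidth x float> of all 1.0.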
234
235 Value *Builder::VUNDEF_IPTR()
236 {
237 return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
238 }
239
240 Value *Builder::VUNDEF(Type* t)
241 {
242 return UndefValue::get(VectorType::get(t, mVWidth));
243 }
244
245 Value *Builder::VUNDEF_I()
246 {
247 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
248 }
249
250 Value *Builder::VUNDEF_I_16()
251 {
252 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth16));
253 }
254
255 Value *Builder::VUNDEF_F()
256 {
257 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
258 }
259
260 Value *Builder::VUNDEF_F_16()
261 {
262 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth16));
263 }
264
265 Value *Builder::VUNDEF(Type *ty, uint32_t size)
266 {
267 return UndefValue::get(VectorType::get(ty, size));
268 }
269
270 Value *Builder::VBROADCAST(Value *src, const llvm::Twine& name)
271 {
272 // check if src is already a vector
273 if (src->getType()->isVectorTy())
274 {
275 return src;
276 }
277
278 return VECTOR_SPLAT(mVWidth, src, name);
279 }
280
281 Value *Builder::VBROADCAST_16(Value *src)
282 {
283 // check if src is already a vector
284 if (src->getType()->isVectorTy())
285 {
286 return src;
287 }
288
289 return VECTOR_SPLAT(mVWidth16, src);
290 }
291
292 uint32_t Builder::IMMED(Value* v)
293 {
294 SWR_ASSERT(isa<ConstantInt>(v));
295 ConstantInt *pValConst = cast<ConstantInt>(v);
296 return pValConst->getZExtValue();
297 }
298
299 int32_t Builder::S_IMMED(Value* v)
300 {
301 SWR_ASSERT(isa<ConstantInt>(v));
302 ConstantInt *pValConst = cast<ConstantInt>(v);
303 return pValConst->getSExtValue();
304 }
305
306 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
307 {
308 std::vector<Value*> indices;
309 for (auto i : indexList)
310 indices.push_back(i);
311 return GEPA(ptr, indices);
312 }
313
314 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
315 {
316 std::vector<Value*> indices;
317 for (auto i : indexList)
318 indices.push_back(C(i));
319 return GEPA(ptr, indices);
320 }
321
322 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
323 {
324 std::vector<Value*> indices;
325 for (auto i : indexList)
326 indices.push_back(i);
327 return IN_BOUNDS_GEP(ptr, indices);
328 }
329
330 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
331 {
332 std::vector<Value*> indices;
333 for (auto i : indexList)
334 indices.push_back(C(i));
335 return IN_BOUNDS_GEP(ptr, indices);
336 }
337
338 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
339 {
340 std::vector<Value*> valIndices;
341 for (auto i : indices)
342 valIndices.push_back(C(i));
343 return LOAD(GEPA(basePtr, valIndices), name);
344 }
345
346 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
347 {
348 std::vector<Value*> valIndices;
349 for (auto i : indices)
350 valIndices.push_back(i);
351 return LOAD(GEPA(basePtr, valIndices), name);
352 }
353
354 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
355 {
356 std::vector<Value*> valIndices;
357 for (auto i : indices)
358 valIndices.push_back(C(i));
359 return STORE(val, GEPA(basePtr, valIndices));
360 }
361
362 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
363 {
364 std::vector<Value*> valIndices;
365 for (auto i : indices)
366 valIndices.push_back(i);
367 return STORE(val, GEPA(basePtr, valIndices));
368 }
369
370 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList, const llvm::Twine& name)
371 {
372 std::vector<Value*> args;
373 for (auto arg : argsList)
374 args.push_back(arg);
375 return CALLA(Callee, args, name);
376 }
377
378 CallInst *Builder::CALL(Value *Callee, Value* arg)
379 {
380 std::vector<Value*> args;
381 args.push_back(arg);
382 return CALLA(Callee, args);
383 }
384
385 CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
386 {
387 std::vector<Value*> args;
388 args.push_back(arg1);
389 args.push_back(arg2);
390 return CALLA(Callee, args);
391 }
392
393 CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
394 {
395 std::vector<Value*> args;
396 args.push_back(arg1);
397 args.push_back(arg2);
398 args.push_back(arg3);
399 return CALLA(Callee, args);
400 }
401
402 //////////////////////////////////////////////////////////////////////////
403 Value *Builder::DEBUGTRAP()
404 {
405 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
406 return CALL(func);
407 }
408
409 Value *Builder::VRCP(Value *va, const llvm::Twine& name)
410 {
411 return FDIV(VIMMED1(1.0f), va, name); // 1 / a
412 }
413
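//////////////////////////////////////////////////////////////////////////
/// @brief Evaluate the plane equation a*x + b*y + c using two fused
///        multiply-adds: FMADDPS(vB, vY, FMADDPS(vA, vX, vC)).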
414 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
415 {
416 Value* vOut = FMADDPS(vA, vX, vC);
417 vOut = FMADDPS(vB, vY, vOut);
418 return vOut;
419 }
420
421 //////////////////////////////////////////////////////////////////////////
422 /// @brief Generate an i32 masked load operation in LLVM IR. If not
423 /// supported on the underlying platform, emulate it with float masked load
424 /// @param src - base address pointer for the load
425 /// @param vMask - SIMD wide mask that controls whether to access memory or load 0
426 Value *Builder::MASKLOADD(Value* src,Value* mask)
427 {
428 Value* vResult;
429 // use avx2 maskload instruction if available
430 if(JM()->mArch.AVX2())
431 {
432 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
433 vResult = CALL(func,{src,mask});
434 }
435 else
436 {
437 // maskload intrinsic expects integer mask operand in llvm >= 3.8
438 #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
439 mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
440 #else
441 mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
442 #endif
443 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
444 vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
445 }
446 return vResult;
447 }
448
449 //////////////////////////////////////////////////////////////////////////
450 /// @brief insert a JIT call to CallPrint
451 /// - outputs formatted string to both stdout and VS output window
452 /// - DEBUG builds only
453 /// Usage example:
454 /// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
455 /// where C(lane) creates a constant value to print, and pIndex is the Value*
456 /// result from a GEP, printing out the pointer to memory
457 /// @param printStr - constant string to print, which includes format specifiers
458 /// @param printArgs - initializer list of Value*'s to print to stdout
459 CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
460 {
461 // push the arguments to CallPrint into a vector
462 std::vector<Value*> printCallArgs;
463 // save room for the format string. we still need to modify it for vectors
464 printCallArgs.resize(1);
465
466 // search through the format string for special processing
467 size_t pos = 0;
468 std::string tempStr(printStr);
469 pos = tempStr.find('%', pos);
470 auto v = printArgs.begin();
471
472 while ((pos != std::string::npos) && (v != printArgs.end()))
473 {
474 Value* pArg = *v;
475 Type* pType = pArg->getType();
476
477 if (pType->isVectorTy())
478 {
479 Type* pContainedType = pType->getContainedType(0);
480
481 if (toupper(tempStr[pos + 1]) == 'X')
482 {
483 tempStr[pos] = '0';
484 tempStr[pos + 1] = 'x';
485 tempStr.insert(pos + 2, "%08X ");
486 pos += 7;
487
488 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
489
490 std::string vectorFormatStr;
491 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
492 {
493 vectorFormatStr += "0x%08X ";
494 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
495 }
496
497 tempStr.insert(pos, vectorFormatStr);
498 pos += vectorFormatStr.size();
499 }
500 else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
501 {
502 uint32_t i = 0;
503 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
504 {
505 tempStr.insert(pos, std::string("%f "));
506 pos += 3;
507 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
508 }
509 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
510 }
511 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
512 {
513 uint32_t i = 0;
514 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
515 {
516 tempStr.insert(pos, std::string("%d "));
517 pos += 3;
518 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
519 }
520 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
521 }
522 }
523 else
524 {
525 if (toupper(tempStr[pos + 1]) == 'X')
526 {
527 tempStr[pos] = '0';
528 tempStr.insert(pos + 1, "x%08");
529 printCallArgs.push_back(pArg);
530 pos += 3;
531 }
532 // for %f we need to cast float Values to doubles so that they print out correctly
533 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
534 {
535 printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
536 pos++;
537 }
538 else
539 {
540 printCallArgs.push_back(pArg);
541 }
542 }
543
544 // advance to the next argument
545 v++;
546 pos = tempStr.find('%', ++pos);
547 }
548
549 // create global variable constant string
550 Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
551 GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
552 JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
553
554 // get a pointer to the first character in the constant string array
555 std::vector<Constant*> geplist{C(0),C(0)};
556 Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
557
558 // insert the pointer to the format string in the argument vector
559 printCallArgs[0] = strGEP;
560
561 // get pointer to CallPrint function and insert decl into the module if needed
562 std::vector<Type*> args;
563 args.push_back(PointerType::get(mInt8Ty,0));
564 FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
565 Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
566
567 // if we haven't yet added the symbol to the symbol table
568 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
569 {
570 sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
571 }
572
573 // insert a call to CallPrint
574 return CALLA(callPrintFn,printCallArgs);
575 }
576
577 //////////////////////////////////////////////////////////////////////////
578 /// @brief Wrapper around PRINT with initializer list.
579 CallInst* Builder::PRINT(const std::string &printStr)
580 {
581 return PRINT(printStr, {});
582 }
583
584 //////////////////////////////////////////////////////////////////////////
585 /// @brief Generate a masked gather operation in LLVM IR. If not
586 /// supported on the underlying platform, emulate it with loads
587 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
588 /// @param pBase - Int8* base VB address pointer value
589 /// @param vIndices - SIMD wide value of VB byte offsets
590 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
591 /// @param scale - value to scale indices by
592 Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
593 {
594 Value *vGather;
595
596 // use avx2 gather instruction if available
597 if(JM()->mArch.AVX2())
598 {
599 // force mask to <N x float>, required by vgather
600 Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);
601
602 vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale));
603 }
604 else
605 {
606 Value* pStack = STACKSAVE();
607
608 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
609 Value* vSrcPtr = ALLOCA(vSrc->getType());
610 STORE(vSrc, vSrcPtr);
611
612 vGather = VUNDEF_F();
613 Value *vScaleVec = VIMMED1((uint32_t)scale);
614 Value *vOffsets = MUL(vIndices,vScaleVec);
615 for(uint32_t i = 0; i < mVWidth; ++i)
616 {
617 // single component byte index
618 Value *offset = VEXTRACT(vOffsets,C(i));
619 // byte pointer to component
620 Value *loadAddress = GEP(pBase,offset);
621 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
622 // pointer to the value to load if we're masking off a component
623 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
624 Value *selMask = VEXTRACT(vMask,C(i));
625 // switch in a safe address to load from if this lane is masked off
626 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
627 Value *val = LOAD(validAddress);
628 vGather = VINSERT(vGather,val,C(i));
629 }
630
631 STACKRESTORE(pStack);
632 }
633
634 return vGather;
635 }
636
637 Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
638 {
639 Value *vGather = VUNDEF_F_16();
640
641 // use AVX512F gather instruction if available
642 if (JM()->mArch.AVX512F())
643 {
644 // force mask to <N-bit Integer>, required by vgather2
645 Value *mask = BITCAST(vMask, mInt16Ty);
646
647 vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
648 }
649 else
650 {
651 Value *src0 = EXTRACT_16(vSrc, 0);
652 Value *src1 = EXTRACT_16(vSrc, 1);
653
654 Value *indices0 = EXTRACT_16(vIndices, 0);
655 Value *indices1 = EXTRACT_16(vIndices, 1);
656
657 Value *mask0 = EXTRACT_16(vMask, 0);
658 Value *mask1 = EXTRACT_16(vMask, 1);
659
660 Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
661 Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
662
663 vGather = JOIN_16(gather0, gather1);
664 }
665
666 return vGather;
667 }
668
669 //////////////////////////////////////////////////////////////////////////
670 /// @brief Generate a masked gather operation in LLVM IR. If not
671 /// supported on the underlying platform, emulate it with loads
672 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
673 /// @param pBase - Int8* base VB address pointer value
674 /// @param vIndices - SIMD wide value of VB byte offsets
675 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
676 /// @param scale - value to scale indices by
677 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
678 {
679 Value* vGather;
680
681 // use avx2 gather instruction if available
682 if(JM()->mArch.AVX2())
683 {
684 vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
685 }
686 else
687 {
688 Value* pStack = STACKSAVE();
689
690 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
691 Value* vSrcPtr = ALLOCA(vSrc->getType());
692 STORE(vSrc, vSrcPtr);
693
694 vGather = VUNDEF_I();
695 Value *vScaleVec = VIMMED1((uint32_t)scale);
696 Value *vOffsets = MUL(vIndices, vScaleVec);
697 for(uint32_t i = 0; i < mVWidth; ++i)
698 {
699 // single component byte index
700 Value *offset = VEXTRACT(vOffsets, C(i));
701 // byte pointer to component
702 Value *loadAddress = GEP(pBase, offset);
703 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
704 // pointer to the value to load if we're masking off a component
705 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
706 Value *selMask = VEXTRACT(vMask, C(i));
707 // switch in a safe address to load from if this lane is masked off
708 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
709 Value *val = LOAD(validAddress, C(0));
710 vGather = VINSERT(vGather, val, C(i));
711 }
712
713 STACKRESTORE(pStack);
714 }
715
716 return vGather;
717 }
718
719 Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
720 {
721 Value *vGather = VUNDEF_I_16();
722
723 // use AVX512F gather instruction if available
724 if (JM()->mArch.AVX512F())
725 {
726 // force mask to <N-bit Integer>, required by vgather2
727 Value *mask = BITCAST(vMask, mInt16Ty);
728
729 vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
730 }
731 else
732 {
733 Value *src0 = EXTRACT_16(vSrc, 0);
734 Value *src1 = EXTRACT_16(vSrc, 1);
735
736 Value *indices0 = EXTRACT_16(vIndices, 0);
737 Value *indices1 = EXTRACT_16(vIndices, 1);
738
739 Value *mask0 = EXTRACT_16(vMask, 0);
740 Value *mask1 = EXTRACT_16(vMask, 1);
741
742 Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
743 Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);
744
745 vGather = JOIN_16(gather0, gather1);
746 }
747
748 return vGather;
749 }
750
751 //////////////////////////////////////////////////////////////////////////
752 /// @brief Generate a masked gather operation in LLVM IR. If not
753 /// supported on the underlying platform, emulate it with loads
754 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
755 /// @param pBase - Int8* base VB address pointer value
756 /// @param vIndices - SIMD wide value of VB byte offsets
757 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
758 /// @param scale - value to scale indices by
759 Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
760 {
761 Value* vGather;
762
763 // use avx2 gather instruction if available
764 if(JM()->mArch.AVX2())
765 {
766 vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth/2)), VectorType::get(mDoubleTy, mVWidth/2));
767 vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
768 }
769 else
770 {
771 Value* pStack = STACKSAVE();
772
773 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
774 Value* vSrcPtr = ALLOCA(vSrc->getType());
775 STORE(vSrc, vSrcPtr);
776
777 vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
778 Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
779 Value *vOffsets = MUL(vIndices,vScaleVec);
780 for(uint32_t i = 0; i < mVWidth/2; ++i)
781 {
782 // single component byte index
783 Value *offset = VEXTRACT(vOffsets,C(i));
784 // byte pointer to component
785 Value *loadAddress = GEP(pBase,offset);
786 loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
787 // pointer to the value to load if we're masking off a component
788 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
789 Value *selMask = VEXTRACT(vMask,C(i));
790 // switch in a safe address to load from if this lane is masked off
791 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
792 Value *val = LOAD(validAddress);
793 vGather = VINSERT(vGather,val,C(i));
794 }
795 STACKRESTORE(pStack);
796 }
797 return vGather;
798 }
799
800 Value *Builder::EXTRACT_16(Value *x, uint32_t imm)
801 {
802 if (imm == 0)
803 {
804 return VSHUFFLE(x, UndefValue::get(x->getType()), { 0, 1, 2, 3, 4, 5, 6, 7 });
805 }
806 else
807 {
808 return VSHUFFLE(x, UndefValue::get(x->getType()), { 8, 9, 10, 11, 12, 13, 14, 15 });
809 }
810 }
811
812 Value *Builder::JOIN_16(Value *a, Value *b)
813 {
814 return VSHUFFLE(a, b, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
815 }
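// Note: EXTRACT_16(x, 0/1) selects the low/high 8 lanes of a 16-wide vector,
// and JOIN_16 is its inverse, so JOIN_16(EXTRACT_16(x, 0), EXTRACT_16(x, 1))
// reproduces x.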
816
817 //////////////////////////////////////////////////////////////////////////
818 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
819 Value *Builder::MASK(Value *vmask)
820 {
821 Value *src = BITCAST(vmask, mSimdInt32Ty);
822 return ICMP_SLT(src, VIMMED1(0));
823 }
824
825 Value *Builder::MASK_16(Value *vmask)
826 {
827 Value *src = BITCAST(vmask, mSimd16Int32Ty);
828 return ICMP_SLT(src, VIMMED1_16(0));
829 }
830
831 //////////////////////////////////////////////////////////////////////////
832 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
833 Value *Builder::VMASK(Value *mask)
834 {
835 return S_EXT(mask, mSimdInt32Ty);
836 }
837
838 Value *Builder::VMASK_16(Value *mask)
839 {
840 return S_EXT(mask, mSimd16Int32Ty);
841 }
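// Note: MASK and VMASK are inverses for x86-style lane masks: MASK tests the
// per-lane sign bit to produce an <N x i1> mask, and VMASK sign-extends an
// <N x i1> mask back to all-ones/all-zeros i32 lanes.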
842
843 //////////////////////////////////////////////////////////////////////////
844 /// @brief Generate a VPSHUFB operation in LLVM IR. If not
845 /// supported on the underlying platform, emulate it
846 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
847 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
848 /// Byte masks in the lower 128-bit lane of b select 8-bit values from the lower
849 /// 128 bits of a, and vice versa for the upper lanes. If the mask
850 /// value is negative, '0' is inserted.
851 Value *Builder::PSHUFB(Value* a, Value* b)
852 {
853 Value* res;
854 // use avx2 pshufb instruction if available
855 if(JM()->mArch.AVX2())
856 {
857 res = VPSHUFB(a, b);
858 }
859 else
860 {
861 Constant* cB = dyn_cast<Constant>(b);
862 // number of 8 bit elements in b
863 uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
864 // output vector
865 Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
866
867 // insert an 8 bit value from the high and low lanes of a per loop iteration
868 numElms /= 2;
869 for(uint32_t i = 0; i < numElms; i++)
870 {
871 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
872 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
873
874 // extract values from constant mask
875 char valLow128bLane = (char)(cLow128b->getSExtValue());
876 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
877
878 Value* insertValLow128b;
879 Value* insertValHigh128b;
880
881 // if the mask value is negative, insert a '0' in the respective output position
882 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
883 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
884 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
885
886 vShuf = VINSERT(vShuf, insertValLow128b, i);
887 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
888 }
889 res = vShuf;
890 }
891 return res;
892 }
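// Example (emulation path, illustrative only): if byte 0 of the lower lane of
// b is 0x05, output byte 0 is a[5]; if it is 0x80 (negative), output byte 0
// is 0, matching vpshufb semantics.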
893
894 //////////////////////////////////////////////////////////////////////////
895 /// @brief Generate a VPMOVSXBD operation (sign extend 8 8-bit values to 32
896 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
897 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
898 /// lower 8 values are used.
899 Value *Builder::PMOVSXBD(Value* a)
900 {
901 // VPMOVSXBD output type
902 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
903 // Extract 8 values from 128bit lane and sign extend
904 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
905 }
906
907 //////////////////////////////////////////////////////////////////////////
908 /// @brief Generate a VPMOVSXWD operation (sign extend 8 16-bit values to 32
909 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
910 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
911 Value *Builder::PMOVSXWD(Value* a)
912 {
913 // VPMOVSXWD output type
914 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
915 // Extract 8 values from 128bit lane and sign extend
916 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
917 }
918
919 //////////////////////////////////////////////////////////////////////////
920 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
921 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
922 /// platform, emulate it
923 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
924 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
925 Value *Builder::PERMD(Value* a, Value* idx)
926 {
927 Value* res;
928 // use avx2 permute instruction if available
929 if(JM()->mArch.AVX2())
930 {
931 res = VPERMD(a, idx);
932 }
933 else
934 {
935 if (isa<Constant>(idx))
936 {
937 res = VSHUFFLE(a, a, idx);
938 }
939 else
940 {
941 res = VUNDEF_I();
942 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
943 {
944 Value* pIndex = VEXTRACT(idx, C(l));
945 Value* pVal = VEXTRACT(a, pIndex);
946 res = VINSERT(res, pVal, C(l));
947 }
948 }
949 }
950 return res;
951 }
952
953 //////////////////////////////////////////////////////////////////////////
954 /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
955 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
956 /// platform, emulate it
957 /// @param a - 256bit SIMD lane(8x32bit) of float values.
958 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
959 Value *Builder::PERMPS(Value* a, Value* idx)
960 {
961 Value* res;
962 // use avx2 permute instruction if available
963 if (JM()->mArch.AVX2())
964 {
965 // llvm 3.6.0 swapped the order of the args to vpermps
966 res = VPERMPS(idx, a);
967 }
968 else
969 {
970 if (isa<Constant>(idx))
971 {
972 res = VSHUFFLE(a, a, idx);
973 }
974 else
975 {
976 res = VUNDEF_F();
977 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
978 {
979 Value* pIndex = VEXTRACT(idx, C(l));
980 Value* pVal = VEXTRACT(a, pIndex);
981 res = VINSERT(res, pVal, C(l));
982 }
983 }
984 }
985
986 return res;
987 }
988
989 //////////////////////////////////////////////////////////////////////////
990 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
991 /// in LLVM IR. If not supported on the underlying platform, emulate it
992 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
993 Value *Builder::CVTPH2PS(Value* a, const llvm::Twine& name)
994 {
995 if (JM()->mArch.F16C())
996 {
997 return VCVTPH2PS(a, name);
998 }
999 else
1000 {
1001 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
1002 Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy));
1003
1004 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == nullptr)
1005 {
1006 sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", (void *)&ConvertFloat16ToFloat32);
1007 }
1008
1009 Value* pResult = UndefValue::get(mSimdFP32Ty);
1010 for (uint32_t i = 0; i < mVWidth; ++i)
1011 {
1012 Value* pSrc = VEXTRACT(a, C(i));
1013 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
1014 pResult = VINSERT(pResult, pConv, C(i));
1015 }
1016
1017 pResult->setName(name);
1018 return pResult;
1019 }
1020 }
1021
1022 //////////////////////////////////////////////////////////////////////////
1023 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
1024 /// in LLVM IR. If not supported on the underlying platform, emulate it
1025 /// @param a - 256bit SIMD lane(8x32bit) of float32 values.
1026 Value *Builder::CVTPS2PH(Value* a, Value* rounding)
1027 {
1028 if (JM()->mArch.F16C())
1029 {
1030 return VCVTPS2PH(a, rounding);
1031 }
1032 else
1033 {
1034 // call scalar C function for now
1035 FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
1036 Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy));
1037
1038 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr)
1039 {
1040 sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", (void *)&ConvertFloat32ToFloat16);
1041 }
1042
1043 Value* pResult = UndefValue::get(mSimdInt16Ty);
1044 for (uint32_t i = 0; i < mVWidth; ++i)
1045 {
1046 Value* pSrc = VEXTRACT(a, C(i));
1047 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
1048 pResult = VINSERT(pResult, pConv, C(i));
1049 }
1050
1051 return pResult;
1052 }
1053 }
1054
1055 Value *Builder::PMAXSD(Value* a, Value* b)
1056 {
1057 Value* cmp = ICMP_SGT(a, b);
1058 return SELECT(cmp, a, b);
1059 }
1060
1061 Value *Builder::PMINSD(Value* a, Value* b)
1062 {
1063 Value* cmp = ICMP_SLT(a, b);
1064 return SELECT(cmp, a, b);
1065 }
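// Note: PMAXSD/PMINSD implement the signed max/min semantics of vpmaxsd/vpminsd
// with a compare + select, so they lower correctly on any target.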
1066
1067 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
1068 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1069 {
1070 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
1071 if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
1072 {
1073 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1074 }
1075 else
1076 {
1077 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1078 }
1079 }
1080
1081 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1082 Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
1083 {
1084 switch(info.bpp / info.numComps)
1085 {
1086 case 16:
1087 {
1088 Value* vGatherResult[2];
1089
1090 // TODO: vGatherMaskedVal
1091 Value* vGatherMaskedVal = VIMMED1((float)0);
1092
1093 // always have at least one component out of x or y to fetch
1094
1095 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1096 // e.g. result of first 8x32bit integer gather for 16bit components
1097 // 256i - 0 1 2 3 4 5 6 7
1098 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1099 //
1100
1101 // if we have at least one component out of z or w to fetch
1102 if(info.numComps > 2)
1103 {
1104 // offset base to the next components(zw) in the vertex to gather
1105 pSrcBase = GEP(pSrcBase, C((char)4));
1106
1107 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1108 // e.g. result of second 8x32bit integer gather for 16bit components
1109 // 256i - 0 1 2 3 4 5 6 7
1110 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1111 //
1112 }
1113 else
1114 {
1115 vGatherResult[1] = vGatherMaskedVal;
1116 }
1117
1118 // Shuffle gathered components into place, each row is a component
1119 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1120 }
1121 break;
1122 case 32:
1123 {
1124 // apply defaults
1125 for (uint32_t i = 0; i < 4; ++i)
1126 {
1127 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1128 }
1129
1130 for(uint32_t i = 0; i < info.numComps; i++)
1131 {
1132 uint32_t swizzleIndex = info.swizzle[i];
1133
1134 // Gather a SIMD of components
1135 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
1136
1137 // offset base to the next component to gather
1138 pSrcBase = GEP(pSrcBase, C((char)4));
1139 }
1140 }
1141 break;
1142 default:
1143 SWR_INVALID("Invalid float format");
1144 break;
1145 }
1146 }
1147
1148 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1149 Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
1150 {
1151 switch (info.bpp / info.numComps)
1152 {
1153 case 8:
1154 {
1155 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1156 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1157 // e.g. result of an 8x32bit integer gather for 8bit components
1158 // 256i - 0 1 2 3 4 5 6 7
1159 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1160
1161 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1162 }
1163 break;
1164 case 16:
1165 {
1166 Value* vGatherResult[2];
1167
1168 // TODO: vGatherMaskedVal
1169 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1170
1171 // always have at least one component out of x or y to fetch
1172
1173 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1174 // e.g. result of first 8x32bit integer gather for 16bit components
1175 // 256i - 0 1 2 3 4 5 6 7
1176 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1177 //
1178
1179 // if we have at least one component out of z or w to fetch
1180 if(info.numComps > 2)
1181 {
1182 // offset base to the next components(zw) in the vertex to gather
1183 pSrcBase = GEP(pSrcBase, C((char)4));
1184
1185 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1186 // e.g. result of second 8x32bit integer gather for 16bit components
1187 // 256i - 0 1 2 3 4 5 6 7
1188 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1189 //
1190 }
1191 else
1192 {
1193 vGatherResult[1] = vGatherMaskedVal;
1194 }
1195
1196 // Shuffle gathered components into place, each row is a component
1197 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1198
1199 }
1200 break;
1201 case 32:
1202 {
1203 // apply defaults
1204 for (uint32_t i = 0; i < 4; ++i)
1205 {
1206 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1207 }
1208
1209 for(uint32_t i = 0; i < info.numComps; i++)
1210 {
1211 uint32_t swizzleIndex = info.swizzle[i];
1212
1213 // Gather a SIMD of components
1214 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
1215
1216 // offset base to the next component to gather
1217 pSrcBase = GEP(pSrcBase, C((char)4));
1218 }
1219 }
1220 break;
1221 default:
1222 SWR_INVALID("unsupported format");
1223 break;
1224 }
1225 }
1226
1227 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1228 {
1229 // cast types
1230 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1231 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1232
1233 // input could either be float or int vector; do shuffle work in int
1234 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1235 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1236
1237 if(bPackedOutput)
1238 {
1239 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1240
1241 // shuffle mask
1242 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1243 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1244 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1245 // after pshufb: group components together in each 128bit lane
1246 // 256i - 0 1 2 3 4 5 6 7
1247 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1248
1249 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1250 // after PERMD: move and pack xy components into each 128bit lane
1251 // 256i - 0 1 2 3 4 5 6 7
1252 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1253
1254 // do the same for zw components
1255 Value* vi128ZW = nullptr;
1256 if(info.numComps > 2)
1257 {
1258 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1259 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1260 }
1261
1262 for(uint32_t i = 0; i < 4; i++)
1263 {
1264 uint32_t swizzleIndex = info.swizzle[i];
1265 // todo: fix for packed
1266 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1267 if(i >= info.numComps)
1268 {
1269 // set the default component val
1270 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1271 continue;
1272 }
1273
1274 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1275 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1276 // if x or y, use vi128XY permute result, else use vi128ZW
1277 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1278
1279 // extract packed component 128 bit lanes
1280 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1281 }
1282
1283 }
1284 else
1285 {
1286 // pshufb masks for each component
1287 Value* vConstMask[2];
1288 // x/z shuffle mask
1289 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1290 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1291
1292 // y/w shuffle mask
1293 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1294 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1295
1296
1297 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1298 // apply defaults
1299 for (uint32_t i = 0; i < 4; ++i)
1300 {
1301 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1302 }
1303
1304 for(uint32_t i = 0; i < info.numComps; i++)
1305 {
1306 uint32_t swizzleIndex = info.swizzle[i];
1307
1308 // select correct constMask for x/z or y/w pshufb
1309 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1310 // if x or y, use vi128XY permute result, else use vi128ZW
1311 uint32_t selectedGather = (i < 2) ? 0 : 1;
1312
1313 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1314 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1315 // 256i - 0 1 2 3 4 5 6 7
1316 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1317 }
1318 }
1319 }
1320
1321 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1322 {
1323 // cast types
1324 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1325 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1326
1327 if(bPackedOutput)
1328 {
1329 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1330 // shuffle mask
1331 Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1332 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1333 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1334 // after pshufb: group components together in each 128bit lane
1335 // 256i - 0 1 2 3 4 5 6 7
1336 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1337
1338 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1339 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1340 // 256i - 0 1 2 3 4 5 6 7
1341 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1342
1343 // do the same for zw components
1344 Value* vi128ZW = nullptr;
1345 if(info.numComps > 2)
1346 {
1347 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1348 }
1349
1350 // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
1351 for(uint32_t i = 0; i < 4; i++)
1352 {
1353 uint32_t swizzleIndex = info.swizzle[i];
1354 // todo: fix for packed
1355 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1356 if(i >= info.numComps)
1357 {
1358 // set the default component val
1359 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1360 continue;
1361 }
1362
1363 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1364 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1365 // if x or y, use vi128XY permute result, else use vi128ZW
1366 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1367
1368 // sign extend
1369 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1370 }
1371 }
1372 // else zero extend
1373 else{
1374 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1375 // apply defaults
1376 for (uint32_t i = 0; i < 4; ++i)
1377 {
1378 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1379 }
1380
1381 for(uint32_t i = 0; i < info.numComps; i++){
1382 uint32_t swizzleIndex = info.swizzle[i];
1383
1384 // pshufb masks for each component
1385 Value* vConstMask;
1386 switch(i)
1387 {
1388 case 0:
1389 // x shuffle mask
1390 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1391 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1392 break;
1393 case 1:
1394 // y shuffle mask
1395 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1396 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1397 break;
1398 case 2:
1399 // z shuffle mask
1400 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1401 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1402 break;
1403 case 3:
1404 // w shuffle mask
1405 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1406 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1407 break;
1408 default:
1409 vConstMask = nullptr;
1410 break;
1411 }
1412
1413 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1414 // after pshufb for x channel
1415 // 256i - 0 1 2 3 4 5 6 7
1416 // x000 x000 x000 x000 x000 x000 x000 x000
1417 }
1418 }
1419 }
1420
1421 // Helper function to create alloca in entry block of function
1422 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
1423 {
1424 auto saveIP = IRB()->saveIP();
1425 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1426 pFunc->getEntryBlock().begin());
1427 Value* pAlloca = ALLOCA(pType);
1428 if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1429 return pAlloca;
1430 }
1431
1432 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
1433 {
1434 auto saveIP = IRB()->saveIP();
1435 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1436 pFunc->getEntryBlock().begin());
1437 Value* pAlloca = ALLOCA(pType, pArraySize);
1438 if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1439 return pAlloca;
1440 }
1441
1442 //////////////////////////////////////////////////////////////////////////
1443 /// @brief emulates a scatter operation.
1444 /// @param pDst - pointer to destination
1445 /// @param vSrc - vector of src data to scatter
1446 /// @param vOffsets - vector of byte offsets from pDst
1447 /// @param vMask - mask of valid lanes
1448 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1449 {
1450 /* Scatter algorithm
1451
1452 while(Index = BitScanForward(mask))
1453 srcElem = srcVector[Index]
1454 offsetElem = offsetVector[Index]
1455 *(pDst + offsetElem) = srcElem
1456 Update mask (mask &= ~(1 << Index))
1457
1458 */
1459
1460 BasicBlock* pCurBB = IRB()->GetInsertBlock();
1461 Function* pFunc = pCurBB->getParent();
1462 Type* pSrcTy = vSrc->getType()->getVectorElementType();
1463
1464 // Store vectors on stack
1465 if (pScatterStackSrc == nullptr)
1466 {
1467 // Save off stack allocations and reuse per scatter. Significantly reduces stack
1468 // requirements for shaders with a lot of scatters.
1469 pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
1470 pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
1471 }
1472
1473 Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
1474 Value* pOffsetsArrayPtr = pScatterStackOffsets;
1475 STORE(vSrc, pSrcArrayPtr);
1476 STORE(vOffsets, pOffsetsArrayPtr);
1477
1478 // Cast to pointers for random access
1479 pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
1480 pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
1481
1482 Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
1483
1484 // Get cttz function
1485 Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
1486
1487 // Setup loop basic block
1488 BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);
1489
1490 // compute first set bit
1491 Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
1492
1493 Value* pIsUndef = ICMP_EQ(pIndex, C(32));
1494
1495 // Split current block
1496 BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
1497
1498 // Remove unconditional jump created by splitBasicBlock
1499 pCurBB->getTerminator()->eraseFromParent();
1500
1501 // Add terminator to end of original block
1502 IRB()->SetInsertPoint(pCurBB);
1503
1504 // Add conditional branch
1505 COND_BR(pIsUndef, pPostLoop, pLoop);
1506
1507 // Add loop basic block contents
1508 IRB()->SetInsertPoint(pLoop);
1509 PHINode* pIndexPhi = PHI(mInt32Ty, 2);
1510 PHINode* pMaskPhi = PHI(mInt32Ty, 2);
1511
1512 pIndexPhi->addIncoming(pIndex, pCurBB);
1513 pMaskPhi->addIncoming(pMask, pCurBB);
1514
1515 // Extract elements for this index
1516 Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
1517 Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
1518
1519 // GEP to this offset in dst
1520 Value* pCurDst = GEP(pDst, pOffsetElem);
1521 pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
1522 STORE(pSrcElem, pCurDst);
1523
1524 // Update the mask
1525 Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
1526
1527 // Terminator
1528 Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
1529
1530 pIsUndef = ICMP_EQ(pNewIndex, C(32));
1531 COND_BR(pIsUndef, pPostLoop, pLoop);
1532
1533 // Update phi edges
1534 pIndexPhi->addIncoming(pNewIndex, pLoop);
1535 pMaskPhi->addIncoming(pNewMask, pLoop);
1536
1537 // Move builder to beginning of post loop
1538 IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
1539 }
1540
1541 Value* Builder::VABSPS(Value* a)
1542 {
1543 Value* asInt = BITCAST(a, mSimdInt32Ty);
1544 Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1545 return result;
1546 }
1547
1548 Value *Builder::ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name)
1549 {
1550 Value *lowCmp = ICMP_SLT(src, low);
1551 Value *ret = SELECT(lowCmp, low, src);
1552
1553 Value *highCmp = ICMP_SGT(ret, high);
1554 ret = SELECT(highCmp, high, ret, name);
1555
1556 return ret;
1557 }
1558
1559 Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1560 {
1561 Value *lowCmp = FCMP_OLT(src, low);
1562 Value *ret = SELECT(lowCmp, low, src);
1563
1564 Value *highCmp = FCMP_OGT(ret, high);
1565 ret = SELECT(highCmp, high, ret);
1566
1567 return ret;
1568 }
1569
1570 Value *Builder::FCLAMP(Value* src, float low, float high)
1571 {
1572 Value* result = VMAXPS(src, VIMMED1(low));
1573 result = VMINPS(result, VIMMED1(high));
1574
1575 return result;
1576 }
1577
1578 //////////////////////////////////////////////////////////////////////////
1579 /// @brief save/restore stack, providing ability to push/pop the stack and
1580 /// reduce overall stack requirements for temporary stack use
1581 Value* Builder::STACKSAVE()
1582 {
1583 Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1584 return CALLA(pfnStackSave);
1585 }
1586
1587 void Builder::STACKRESTORE(Value* pSaved)
1588 {
1589 Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1590 CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1591 }
1592
1593 Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1594 {
1595 Value* vOut;
1596 // use FMADs if available
1597 if(JM()->mArch.AVX2())
1598 {
1599 vOut = VFMADDPS(a, b, c);
1600 }
1601 else
1602 {
1603 vOut = FADD(FMUL(a, b), c);
1604 }
1605 return vOut;
1606 }
1607
1608 Value* Builder::POPCNT(Value* a)
1609 {
1610 Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1611 return CALL(pCtPop, std::initializer_list<Value*>{a});
1612 }
1613
1614 //////////////////////////////////////////////////////////////////////////
1615 /// @brief C functions called by LLVM IR
1616 //////////////////////////////////////////////////////////////////////////
1617
1618 //////////////////////////////////////////////////////////////////////////
1619 /// @brief called in JIT code, inserted by PRINT
1620 /// output to both stdout and visual studio debug console
1621 void __cdecl CallPrint(const char* fmt, ...)
1622 {
1623 va_list args;
1624 va_start(args, fmt);
1625 vprintf(fmt, args);
1626
1627 #if defined( _WIN32 )
1628 char strBuf[1024];
1629 vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1630 OutputDebugStringA(strBuf);
1631 #endif
1632
1633 va_end(args);
1634 }
1635
1636 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1637 {
1638 bool flag = !imm8->isZeroValue();
1639 SmallVector<Constant*,8> idx;
1640 for (unsigned i = 0; i < mVWidth / 2; i++) {
1641 idx.push_back(C(flag ? i + mVWidth / 2 : i));
1642 }
1643 return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1644 }
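// Example (illustrative only): with mVWidth == 8, VEXTRACTI128(a, C(1))
// shuffles out lanes 4..7 of a, i.e. the upper 128 bits of a 256-bit vector.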
1645
1646 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1647 {
1648 bool flag = !imm8->isZeroValue();
1649 SmallVector<Constant*,8> idx;
1650 for (unsigned i = 0; i < mVWidth; i++) {
1651 idx.push_back(C(i));
1652 }
1653 Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1654
1655 SmallVector<Constant*,8> idx2;
1656 for (unsigned i = 0; i < mVWidth / 2; i++) {
1657 idx2.push_back(C(flag ? i : i + mVWidth));
1658 }
1659 for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1660 idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1661 }
1662 return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1663 }
1664
1665 // rdtsc buckets macros
1666 void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1667 {
1668 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1669 // buckets framework when single threaded
1670 if (KNOB_SINGLE_THREADED)
1671 {
1672 std::vector<Type*> args{
1673 PointerType::get(mInt32Ty, 0), // pBucketMgr
1674 mInt32Ty // id
1675 };
1676
1677 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1678 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1679 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1680 {
1681 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1682 }
1683
1684 CALL(pFunc, { pBucketMgr, pId });
1685 }
1686 }
1687
1688 void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1689 {
1690 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1691 // buckets framework when single threaded
1692 if (KNOB_SINGLE_THREADED)
1693 {
1694 std::vector<Type*> args{
1695 PointerType::get(mInt32Ty, 0), // pBucketMgr
1696 mInt32Ty // id
1697 };
1698
1699 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1700 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1701 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1702 {
1703 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1704 }
1705
1706 CALL(pFunc, { pBucketMgr, pId });
1707 }
1708 }
1709
1710
1711 uint32_t Builder::GetTypeSize(Type* pType)
1712 {
1713 if (pType->isStructTy())
1714 {
1715 uint32_t numElems = pType->getStructNumElements();
1716 Type* pElemTy = pType->getStructElementType(0);
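// note: assumes a homogeneous struct with no padding; the size of the
// first element is multiplied by the element count.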
1717 return numElems * GetTypeSize(pElemTy);
1718 }
1719
1720 if (pType->isArrayTy())
1721 {
1722 uint32_t numElems = pType->getArrayNumElements();
1723 Type* pElemTy = pType->getArrayElementType();
1724 return numElems * GetTypeSize(pElemTy);
1725 }
1726
1727 if (pType->isIntegerTy())
1728 {
1729 uint32_t bitSize = pType->getIntegerBitWidth();
1730 return bitSize / 8;
1731 }
1732
1733 if (pType->isFloatTy())
1734 {
1735 return 4;
1736 }
1737
1738 if (pType->isHalfTy())
1739 {
1740 return 2;
1741 }
1742
1743 if (pType->isDoubleTy())
1744 {
1745 return 8;
1746 }
1747
1748 SWR_ASSERT(false, "Unimplemented type.");
1749 return 0;
1750 }
1751 }