swr: [rasterizer jitter] Fix MASKLOADD AVX prototype (float -> i32)
[mesa.git] / src / gallium / drivers / swr / rasterizer / jitter / builder_misc.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "common/rdtsc_buckets.h"
32
33 #include "llvm/Support/DynamicLibrary.h"
34
35 void __cdecl CallPrint(const char* fmt, ...);
36
37 //////////////////////////////////////////////////////////////////////////
38 /// @brief Convert an IEEE 754 32-bit single precision float to a
39 /// 16-bit half-precision float with 5 exponent bits and 10
40 /// mantissa bits.
41 /// @param val - 32-bit float
42 /// @todo Maybe move this outside of this file into a header?
43 static uint16_t Convert32To16Float(float val)
44 {
45 uint32_t sign, exp, mant;
46 uint32_t roundBits;
47
48 // Extract the sign, exponent, and mantissa
49 uint32_t uf = *(uint32_t*)&val;
50 sign = (uf & 0x80000000) >> 31;
51 exp = (uf & 0x7F800000) >> 23;
52 mant = uf & 0x007FFFFF;
53
54 // Check for out of range
55 if (std::isnan(val))
56 {
57 exp = 0x1F;
58 mant = 0x200;
59 sign = 1; // set the sign bit for NANs
60 }
61 else if (std::isinf(val))
62 {
63 exp = 0x1f;
64 mant = 0x0;
65 }
66 else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
67 {
68 exp = 0x1E;
69 mant = 0x3FF;
70 }
71 else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
72 {
73 mant |= 0x00800000;
74 for (; exp <= 0x70; mant >>= 1, exp++)
75 ;
76 exp = 0;
77 mant = mant >> 13;
78 }
79 else if (exp < 0x66) // Too small to represent -> Zero
80 {
81 exp = 0;
82 mant = 0;
83 }
84 else
85 {
86 // Saves bits that will be shifted off for rounding
87 roundBits = mant & 0x1FFFu;
88 // convert exponent and mantissa to 16 bit format
89 exp = exp - 0x70;
90 mant = mant >> 13;
91
92 // Essentially RTZ, but round up if off by only 1 lsb
93 if (roundBits == 0x1FFFu)
94 {
95 mant++;
96 // check for overflow
97 if ((mant & 0xC00u) != 0)
98 exp++;
99 // make sure only the needed bits are used
100 mant &= 0x3FF;
101 }
102 }
103
104 uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
105 return (uint16_t)tmpVal;
106 }
107
108 //////////////////////////////////////////////////////////////////////////
109 /// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
110 /// float
111 /// @param val - 16-bit float
112 /// @todo Maybe move this outside of this file into a header?
113 static float ConvertSmallFloatTo32(UINT val)
114 {
115 UINT result;
116 if ((val & 0x7fff) == 0)
117 {
118 result = ((uint32_t)(val & 0x8000)) << 16;
119 }
120 else if ((val & 0x7c00) == 0x7c00)
121 {
122 result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
123 result |= ((uint32_t)val & 0x8000) << 16;
124 }
125 else
126 {
127 uint32_t sign = (val & 0x8000) << 16;
128 uint32_t mant = (val & 0x3ff) << 13;
129 uint32_t exp = (val >> 10) & 0x1f;
130 if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
131 {
132 mant <<= 1;
133 while (mant < (0x400 << 13))
134 {
135 exp--;
136 mant <<= 1;
137 }
138 mant &= (0x3ff << 13);
139 }
140 exp = ((exp - 15 + 127) & 0xff) << 23;
141 result = sign | exp | mant;
142 }
143
144 return *(float*)&result;
145 }
146
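// Editor's sketch (not part of the original source): a minimal round-trip
// check of the two half-float helpers above. ExampleHalfFloatRoundTrip is a
// hypothetical name; 1.5f is exactly representable in the 1s/5e/10m format,
// so the round trip is lossless for that value.
static inline void ExampleHalfFloatRoundTrip()
{
    float original = 1.5f;
    uint16_t half = Convert32To16Float(original);  // encodes as 0x3E00
    float back = ConvertSmallFloatTo32(half);      // back == 1.5f
    (void)back;
}
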
147 Constant *Builder::C(bool i)
148 {
149 return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
150 }
151
152 Constant *Builder::C(char i)
153 {
154 return ConstantInt::get(IRB()->getInt8Ty(), i);
155 }
156
157 Constant *Builder::C(uint8_t i)
158 {
159 return ConstantInt::get(IRB()->getInt8Ty(), i);
160 }
161
162 Constant *Builder::C(int i)
163 {
164 return ConstantInt::get(IRB()->getInt32Ty(), i);
165 }
166
167 Constant *Builder::C(int64_t i)
168 {
169 return ConstantInt::get(IRB()->getInt64Ty(), i);
170 }
171
172 Constant *Builder::C(uint16_t i)
173 {
174 return ConstantInt::get(mInt16Ty,i);
175 }
176
177 Constant *Builder::C(uint32_t i)
178 {
179 return ConstantInt::get(IRB()->getInt32Ty(), i);
180 }
181
182 Constant *Builder::C(float i)
183 {
184 return ConstantFP::get(IRB()->getFloatTy(), i);
185 }
186
187 Constant *Builder::PRED(bool pred)
188 {
189 return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
190 }
191
192 Value *Builder::VIMMED1(int i)
193 {
194 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
195 }
196
197 Value *Builder::VIMMED1(uint32_t i)
198 {
199 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
200 }
201
202 Value *Builder::VIMMED1(float i)
203 {
204 return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
205 }
206
207 Value *Builder::VIMMED1(bool i)
208 {
209 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
210 }
211
212 Value *Builder::VUNDEF_IPTR()
213 {
214 return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
215 }
216
217 Value *Builder::VUNDEF_I()
218 {
219 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
220 }
221
222 Value *Builder::VUNDEF(Type *ty, uint32_t size)
223 {
224 return UndefValue::get(VectorType::get(ty, size));
225 }
226
227 Value *Builder::VUNDEF_F()
228 {
229 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
230 }
231
232 Value *Builder::VUNDEF(Type* t)
233 {
234 return UndefValue::get(VectorType::get(t, mVWidth));
235 }
236
237 #if HAVE_LLVM == 0x306
238 Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
239 {
240 return VINSERT(vec, val, C((int64_t)index));
241 }
242 #endif
243
244 Value *Builder::VBROADCAST(Value *src)
245 {
246 // check if src is already a vector
247 if (src->getType()->isVectorTy())
248 {
249 return src;
250 }
251
252 return VECTOR_SPLAT(mVWidth, src);
253 }
254
255 uint32_t Builder::IMMED(Value* v)
256 {
257 SWR_ASSERT(isa<ConstantInt>(v));
258 ConstantInt *pValConst = cast<ConstantInt>(v);
259 return pValConst->getZExtValue();
260 }
261
262 int32_t Builder::S_IMMED(Value* v)
263 {
264 SWR_ASSERT(isa<ConstantInt>(v));
265 ConstantInt *pValConst = cast<ConstantInt>(v);
266 return pValConst->getSExtValue();
267 }
268
269 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
270 {
271 std::vector<Value*> indices;
272 for (auto i : indexList)
273 indices.push_back(i);
274 return GEPA(ptr, indices);
275 }
276
277 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
278 {
279 std::vector<Value*> indices;
280 for (auto i : indexList)
281 indices.push_back(C(i));
282 return GEPA(ptr, indices);
283 }
284
285 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
286 {
287 std::vector<Value*> valIndices;
288 for (auto i : indices)
289 valIndices.push_back(C(i));
290 return LOAD(GEPA(basePtr, valIndices), name);
291 }
292
293 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
294 {
295 std::vector<Value*> valIndices;
296 for (auto i : indices)
297 valIndices.push_back(i);
298 return LOAD(GEPA(basePtr, valIndices), name);
299 }
300
301 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
302 {
303 std::vector<Value*> valIndices;
304 for (auto i : indices)
305 valIndices.push_back(C(i));
306 return STORE(val, GEPA(basePtr, valIndices));
307 }
308
309 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
310 {
311 std::vector<Value*> valIndices;
312 for (auto i : indices)
313 valIndices.push_back(i);
314 return STORE(val, GEPA(basePtr, valIndices));
315 }
316
317 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
318 {
319 std::vector<Value*> args;
320 for (auto arg : argsList)
321 args.push_back(arg);
322 return CALLA(Callee, args);
323 }
324
325 Value *Builder::VRCP(Value *va)
326 {
327 return FDIV(VIMMED1(1.0f), va); // 1 / a
328 }
329
330 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
331 {
332 Value* vOut = FMADDPS(vA, vX, vC);
333 vOut = FMADDPS(vB, vY, vOut);
334 return vOut;
335 }
336
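// Editor's note (illustrative, not part of the original source): per SIMD
// lane, VPLANEPS above evaluates the plane equation A*x + B*y + C using two
// fused multiply-adds. A scalar reference (hypothetical helper name):
static inline float PlaneEquationReference(float A, float B, float C, float x, float y)
{
    return A * x + B * y + C;
}
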
337 //////////////////////////////////////////////////////////////////////////
338 /// @brief Generate an i32 masked load operation in LLVM IR. If not
339 /// supported on the underlying platform, emulate it with float masked load
340 /// @param src - base address pointer for the load
341 /// @param vMask - SIMD wide mask that controls whether to access memory or load 0
342 Value *Builder::MASKLOADD(Value* src,Value* mask)
343 {
344 Value* vResult;
345 // use avx2 masked load instruction if available
346 if(JM()->mArch.AVX2())
347 {
348 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
349 vResult = CALL(func,{src,mask});
350 }
351 else
352 {
353 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
354 Value* fMask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
355 vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,mVWidth));
356 }
357 return vResult;
358 }
359
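// Editor's sketch (assumption, not part of the original source): scalar
// reference for the 8-wide masked i32 load generated above. Both the AVX2
// maskload.d path and the AVX maskload.ps fallback consult only the sign bit
// of each mask element; inactive lanes produce 0 instead of reading memory.
static inline void MaskLoadDReference(const int32_t* pSrc, const int32_t mask[8], int32_t out[8])
{
    for (int i = 0; i < 8; ++i)
    {
        out[i] = (mask[i] < 0) ? pSrc[i] : 0;
    }
}
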
360 //////////////////////////////////////////////////////////////////////////
361 /// @brief insert a JIT call to CallPrint
362 /// - outputs formatted string to both stdout and VS output window
363 /// - DEBUG builds only
364 /// Usage example:
365 /// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
366 /// where C(lane) creates a constant value to print, and pIndex is the Value*
367 /// result from a GEP, printing out the pointer to memory
368 /// @param printStr - constant string to print, which includes format specifiers
369 /// @param printArgs - initializer list of Value*'s to print to std out
370 CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
371 {
372 // push the arguments to CallPrint into a vector
373 std::vector<Value*> printCallArgs;
374 // save room for the format string. we still need to modify it for vectors
375 printCallArgs.resize(1);
376
377 // search through the format string for special processing
378 size_t pos = 0;
379 std::string tempStr(printStr);
380 pos = tempStr.find('%', pos);
381 auto v = printArgs.begin();
382
383 while ((pos != std::string::npos) && (v != printArgs.end()))
384 {
385 Value* pArg = *v;
386 Type* pType = pArg->getType();
387
388 if (tempStr[pos + 1] == 't')
389 {
390 if (pType->isVectorTy())
391 {
392 Type* pContainedType = pType->getContainedType(0);
393
394 std::string vectorFormatStr;
395
396 if (pContainedType->isFloatTy())
397 {
398 tempStr[pos + 1] = 'f'; // Ensure it's %f
399 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(0)), mDoubleTy));
400
401 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
402 {
403 vectorFormatStr += "%f ";
404 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), mDoubleTy));
405 }
406 }
407 else if (pContainedType->isIntegerTy())
408 {
409 tempStr[pos + 1] = 'd'; // Ensure it's %d
410 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
411
412 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
413 {
414 vectorFormatStr += "%d ";
415 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
416 }
417 }
418 else
419 {
420 SWR_ASSERT(0, "Unsupported type");
421 }
422
423 tempStr.insert(pos, vectorFormatStr);
424 pos += vectorFormatStr.size();
425 }
426 else
427 {
428 if (pType->isFloatTy())
429 {
430 tempStr[pos + 1] = 'f'; // Ensure it's %f
431 printCallArgs.push_back(FP_EXT(pArg, mDoubleTy));
432 }
433 else if (pType->isIntegerTy())
434 {
435 tempStr[pos + 1] = 'd'; // Ensure it's %d
436 printCallArgs.push_back(pArg);
437 }
438 }
439 }
440 else if (toupper(tempStr[pos + 1]) == 'X')
441 {
442 if (pType->isVectorTy())
443 {
444 tempStr[pos] = '0';
445 tempStr.insert(pos + 1, "x%08");
446
447 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
448
449 std::string vectorFormatStr;
450 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
451 {
452 vectorFormatStr += "0x%08X ";
453 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
454 }
455
456 tempStr.insert(pos, vectorFormatStr);
457 pos += vectorFormatStr.size();
458 }
459 else
460 {
461 tempStr[pos] = '0';
462 tempStr.insert(pos + 1, "x%08");
463 printCallArgs.push_back(pArg);
464 pos += 3;
465 }
466 }
467 // for %f we need to cast float Values to doubles so that they print out correctly
468 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
469 {
470 printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
471 pos++;
472 }
473 // add special handling for %f and %d format specifiers to make printing llvm vector types easier
474 else if (pType->isVectorTy())
475 {
476 Type* pContainedType = pType->getContainedType(0);
477
478 if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
479 {
480 uint32_t i = 0;
481 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
482 {
483 tempStr.insert(pos, std::string("%f "));
484 pos += 3;
485 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
486 }
487 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
488 }
489 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
490 {
491 uint32_t i = 0;
492 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
493 {
494 tempStr.insert(pos, std::string("%d "));
495 pos += 3;
496 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
497 }
498 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
499 }
500 else
501 {
502 /// not a supported vector to print
503 /// @todo pointer types too
504 SWR_ASSERT(0);
505 }
506 }
507 else
508 {
509 printCallArgs.push_back(pArg);
510 }
511
512 // advance to the next argument
513 v++;
514 pos = tempStr.find('%', ++pos);
515 }
516
517 // create global variable constant string
518 Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
519 GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
520 JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
521
522 // get a pointer to the first character in the constant string array
523 std::vector<Constant*> geplist{C(0),C(0)};
524 #if HAVE_LLVM == 0x306
525 Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
526 #else
527 Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
528 #endif
529
530 // insert the pointer to the format string in the argument vector
531 printCallArgs[0] = strGEP;
532
533 // get pointer to CallPrint function and insert decl into the module if needed
534 std::vector<Type*> args;
535 args.push_back(PointerType::get(mInt8Ty,0));
536 FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
537 Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
538
539 // if we haven't yet added the symbol to the symbol table
540 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
541 {
542 sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
543 }
544
545 // insert a call to CallPrint
546 return CALLA(callPrintFn,printCallArgs);
547 }
548
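// Editor's note (illustrative, not part of the original source): for vector
// arguments the "%t" specifier handled above is expanded per element, e.g.
//     PRINT("coords: %t\n", {vFloatVec});
// rewrites the format to "coords: %f %f %f %f %f %f %f %f\n" for an 8-wide
// float vector and pushes each extracted lane, FP-extended to double, as a
// separate CallPrint argument.
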
549 //////////////////////////////////////////////////////////////////////////
550 /// @brief Wrapper around PRINT with initializer list.
551 CallInst* Builder::PRINT(const std::string &printStr)
552 {
553 return PRINT(printStr, {});
554 }
555
556 //////////////////////////////////////////////////////////////////////////
557 /// @brief Generate a masked gather operation in LLVM IR. If not
558 /// supported on the underlying platform, emulate it with loads
559 /// @param vSrc - SIMD wide value used for lanes whose mask is not set
560 /// @param pBase - Int8* base VB address pointer value
561 /// @param vIndices - SIMD wide value of VB byte offsets
562 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
563 /// @param scale - value to scale indices by
564 Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
565 {
566 Value* vGather;
567
568 // use avx2 gather instruction if available
569 if(JM()->mArch.AVX2())
570 {
571 // force mask to <N x float>, required by vgather
572 vMask = BITCAST(vMask, mSimdFP32Ty);
573 vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
574 }
575 else
576 {
577 Value* pStack = STACKSAVE();
578
579 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
580 Value* vSrcPtr = ALLOCA(vSrc->getType());
581 STORE(vSrc, vSrcPtr);
582
583 vGather = VUNDEF_F();
584 Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
585 Value *vOffsets = MUL(vIndices,vScaleVec);
586 Value *mask = MASK(vMask);
587 for(uint32_t i = 0; i < mVWidth; ++i)
588 {
589 // single component byte index
590 Value *offset = VEXTRACT(vOffsets,C(i));
591 // byte pointer to component
592 Value *loadAddress = GEP(pBase,offset);
593 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
594 // pointer to the value to load if we're masking off a component
595 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
596 Value *selMask = VEXTRACT(mask,C(i));
597 // select the real load address for active lanes; masked-off lanes read the stack copy of vSrc
598 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
599 Value *val = LOAD(validAddress);
600 vGather = VINSERT(vGather,val,C(i));
601 }
602 STACKRESTORE(pStack);
603 }
604
605 return vGather;
606 }
607
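// Editor's sketch (assumption, not part of the original source): scalar
// reference for the emulated gather path above. Active lanes (mask sign bit
// set) load from pBase + vIndices[i] * scale; inactive lanes keep the
// corresponding element of vSrc.
static inline void GatherPSReference(const float src[8], const uint8_t* pBase,
                                     const int32_t indices[8], const int32_t mask[8],
                                     uint32_t scale, float out[8])
{
    for (int i = 0; i < 8; ++i)
    {
        const float* pLane = (const float*)(pBase + indices[i] * (int32_t)scale);
        out[i] = (mask[i] < 0) ? *pLane : src[i];
    }
}
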
608 //////////////////////////////////////////////////////////////////////////
609 /// @brief Generate a masked gather operation in LLVM IR. If not
610 /// supported on the underlying platform, emulate it with loads
611 /// @param vSrc - SIMD wide value used for lanes whose mask is not set
612 /// @param pBase - Int8* base VB address pointer value
613 /// @param vIndices - SIMD wide value of VB byte offsets
614 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
615 /// @param scale - value to scale indices by
616 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
617 {
618 Value* vGather;
619
620 // use avx2 gather instruction if available
621 if(JM()->mArch.AVX2())
622 {
623 vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
624 }
625 else
626 {
627 Value* pStack = STACKSAVE();
628
629 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
630 Value* vSrcPtr = ALLOCA(vSrc->getType());
631 STORE(vSrc, vSrcPtr);
632
633 vGather = VUNDEF_I();
634 Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
635 Value *vOffsets = MUL(vIndices, vScaleVec);
636 Value *mask = MASK(vMask);
637 for(uint32_t i = 0; i < mVWidth; ++i)
638 {
639 // single component byte index
640 Value *offset = VEXTRACT(vOffsets, C(i));
641 // byte pointer to component
642 Value *loadAddress = GEP(pBase, offset);
643 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
644 // pointer to the value to load if we're masking off a component
645 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
646 Value *selMask = VEXTRACT(mask, C(i));
647 // select the real load address for active lanes; masked-off lanes read the stack copy of vSrc
648 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
649 Value *val = LOAD(validAddress, C(0));
650 vGather = VINSERT(vGather, val, C(i));
651 }
652
653 STACKRESTORE(pStack);
654 }
655 return vGather;
656 }
657
658 //////////////////////////////////////////////////////////////////////////
659 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
660 Value* Builder::MASK(Value* vmask)
661 {
662 Value* src = BITCAST(vmask, mSimdInt32Ty);
663 return ICMP_SLT(src, VIMMED1(0));
664 }
665
666 //////////////////////////////////////////////////////////////////////////
667 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
668 Value* Builder::VMASK(Value* mask)
669 {
670 return S_EXT(mask, mSimdInt32Ty);
671 }
672
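// Editor's note (illustrative, not part of the original source): scalar
// equivalents of the two mask conversions above. An x86-style lane is
// "active" when its sign bit is set; an llvm i1 lane is widened back to
// all-ones / all-zeros by sign extension.
static inline bool MaskLaneReference(int32_t x86Lane) { return x86Lane < 0; }
static inline int32_t VMaskLaneReference(bool lane) { return lane ? -1 : 0; }
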
673 //////////////////////////////////////////////////////////////////////////
674 /// @brief Generate a VPSHUFB operation in LLVM IR. If not
675 /// supported on the underlying platform, emulate it
676 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
677 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
678 /// Byte masks in the lower 128-bit lane of b select 8-bit values from the lower
679 /// 128 bits of a, and vice versa for the upper lanes. If the mask
680 /// value is negative, '0' is inserted.
681 Value *Builder::PSHUFB(Value* a, Value* b)
682 {
683 Value* res;
684 // use avx2 pshufb instruction if available
685 if(JM()->mArch.AVX2())
686 {
687 res = VPSHUFB(a, b);
688 }
689 else
690 {
691 Constant* cB = dyn_cast<Constant>(b);
692 // number of 8 bit elements in b
693 uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
694 // output vector
695 Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
696
697 // insert an 8 bit value from the high and low lanes of a per loop iteration
698 numElms /= 2;
699 for(uint32_t i = 0; i < numElms; i++)
700 {
701 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
702 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
703
704 // extract values from constant mask
705 char valLow128bLane = (char)(cLow128b->getSExtValue());
706 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
707
708 Value* insertValLow128b;
709 Value* insertValHigh128b;
710
711 // if the mask value is negative, insert a '0' in the respective output position
712 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
713 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
714 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
715
716 vShuf = VINSERT(vShuf, insertValLow128b, i);
717 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
718 }
719 res = vShuf;
720 }
721 return res;
722 }
723
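// Editor's sketch (assumption, not part of the original source): scalar
// reference for the 256-bit PSHUFB emulated above. Each output byte selects
// from its own 128-bit half of 'a' using the low 4 bits of the mask byte;
// a negative mask byte produces 0.
static inline void Pshufb256Reference(const int8_t a[32], const int8_t b[32], int8_t out[32])
{
    for (int lane = 0; lane < 2; ++lane)
    {
        for (int i = 0; i < 16; ++i)
        {
            int8_t m = b[lane * 16 + i];
            out[lane * 16 + i] = (m < 0) ? 0 : a[lane * 16 + (m & 0xF)];
        }
    }
}
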
724 //////////////////////////////////////////////////////////////////////////
725 /// @brief Generate a VPMOVSXBD operation (sign extend 8 8-bit values to 32
726 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
727 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
728 /// lower 8 values are used.
729 Value *Builder::PMOVSXBD(Value* a)
730 {
731 Value* res;
732 // use avx2 byte sign extend instruction if available
733 if(JM()->mArch.AVX2())
734 {
735 res = VPMOVSXBD(a);
736 }
737 else
738 {
739 // VPMOVSXBD output type
740 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
741 // Extract 8 values from 128bit lane and sign extend
742 res = S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
743 }
744 return res;
745 }
746
747 //////////////////////////////////////////////////////////////////////////
748 /// @brief Generate a VPMOVSXWD operation (sign extend 8 16-bit values to 32
749 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
750 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
751 Value *Builder::PMOVSXWD(Value* a)
752 {
753 Value* res;
754 // use avx2 word sign extend if available
755 if(JM()->mArch.AVX2())
756 {
757 res = VPMOVSXWD(a);
758 }
759 else
760 {
761 // VPMOVSXWD output type
762 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
763 // Extract 8 values from 128bit lane and sign extend
764 res = S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
765 }
766 return res;
767 }
768
769 //////////////////////////////////////////////////////////////////////////
770 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
771 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
772 /// platform, emulate it
773 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
774 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
775 Value *Builder::PERMD(Value* a, Value* idx)
776 {
777 Value* res;
778 // use avx2 permute instruction if available
779 if(JM()->mArch.AVX2())
780 {
781 // llvm 3.6.0 swapped the order of the args to vpermd
782 res = VPERMD(idx, a);
783 }
784 else
785 {
786 if (isa<Constant>(idx))
787 {
788 res = VSHUFFLE(a, a, idx);
789 }
790 else
791 {
792 res = VUNDEF_I();
793 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
794 {
795 Value* pIndex = VEXTRACT(idx, C(l));
796 Value* pVal = VEXTRACT(a, pIndex);
797 res = VINSERT(res, pVal, C(l));
798 }
799 }
800 }
801 return res;
802 }
803
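// Editor's sketch (assumption, not part of the original source): scalar
// reference mirroring the hardware vpermd semantics that PERMD above is
// meant to produce. Only the low 3 bits of each index participate, and
// selection crosses 128-bit lanes freely.
static inline void PermdReference(const int32_t a[8], const int32_t idx[8], int32_t out[8])
{
    for (int i = 0; i < 8; ++i)
    {
        out[i] = a[idx[i] & 0x7];
    }
}
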
804 //////////////////////////////////////////////////////////////////////////
805 /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
806 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
807 /// platform, emulate it
808 /// @param a - 256bit SIMD lane(8x32bit) of float values.
809 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
810 Value *Builder::PERMPS(Value* a, Value* idx)
811 {
812 Value* res;
813 // use avx2 permute instruction if available
814 if (JM()->mArch.AVX2())
815 {
816 // llvm 3.6.0 swapped the order of the args to vpermd
817 res = VPERMPS(idx, a);
818 }
819 else
820 {
821 if (isa<Constant>(idx))
822 {
823 res = VSHUFFLE(a, a, idx);
824 }
825 else
826 {
827 res = VUNDEF_F();
828 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
829 {
830 Value* pIndex = VEXTRACT(idx, C(l));
831 Value* pVal = VEXTRACT(a, pIndex);
832 res = VINSERT(res, pVal, C(l));
833 }
834 }
835 }
836
837 return res;
838 }
839
840 //////////////////////////////////////////////////////////////////////////
841 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
842 /// in LLVM IR. If not supported on the underlying platform, emulate it
843 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
844 Value *Builder::CVTPH2PS(Value* a)
845 {
846 if (JM()->mArch.F16C())
847 {
848 return VCVTPH2PS(a);
849 }
850 else
851 {
852 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty, false);
853 Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));
854
855 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
856 {
857 sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
858 }
859
860 Value* pResult = UndefValue::get(mSimdFP32Ty);
861 for (uint32_t i = 0; i < mVWidth; ++i)
862 {
863 Value* pSrc = VEXTRACT(a, C(i));
864 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
865 pResult = VINSERT(pResult, pConv, C(i));
866 }
867
868 return pResult;
869 }
870 }
871
872 //////////////////////////////////////////////////////////////////////////
873 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
874 /// in LLVM IR. If not supported on the underlying platform, emulate it
875 /// @param a - 256bit SIMD lane(8x32bit) of float32 values to convert.
876 Value *Builder::CVTPS2PH(Value* a, Value* rounding)
877 {
878 if (JM()->mArch.F16C())
879 {
880 return VCVTPS2PH(a, rounding);
881 }
882 else
883 {
884 // call scalar C function for now
885 FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty, false);
886 Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));
887
888 if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
889 {
890 sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
891 }
892
893 Value* pResult = UndefValue::get(mSimdInt16Ty);
894 for (uint32_t i = 0; i < mVWidth; ++i)
895 {
896 Value* pSrc = VEXTRACT(a, C(i));
897 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
898 pResult = VINSERT(pResult, pConv, C(i));
899 }
900
901 return pResult;
902 }
903 }
904
905 Value *Builder::PMAXSD(Value* a, Value* b)
906 {
907 if (JM()->mArch.AVX2())
908 {
909 return VPMAXSD(a, b);
910 }
911 else
912 {
913 // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
914 Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);
915
916 // low 128
917 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
918 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
919 Value* resLo = CALL(pmaxsd, {aLo, bLo});
920
921 // high 128
922 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
923 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
924 Value* resHi = CALL(pmaxsd, {aHi, bHi});
925
926 // combine
927 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
928 result = VINSERTI128(result, resHi, C((uint8_t)1));
929
930 return result;
931 }
932 }
933
934 Value *Builder::PMINSD(Value* a, Value* b)
935 {
936 if (JM()->mArch.AVX2())
937 {
938 return VPMINSD(a, b);
939 }
940 else
941 {
942 // use 4-wide sse min intrinsic on lower/upper halves of 8-wide sources
943 Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);
944
945 // low 128
946 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
947 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
948 Value* resLo = CALL(pminsd, {aLo, bLo});
949
950 // high 128
951 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
952 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
953 Value* resHi = CALL(pminsd, {aHi, bHi});
954
955 // combine
956 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
957 result = VINSERTI128(result, resHi, C((uint8_t)1));
958
959 return result;
960 }
961 }
962
963 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
964 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
965 {
966 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
967 if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
968 {
969 // ensure our mask is the correct type
970 mask = BITCAST(mask, mSimdFP32Ty);
971 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
972 }
973 else
974 {
975 // ensure our mask is the correct type
976 mask = BITCAST(mask, mSimdInt32Ty);
977 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
978 }
979 }
980
981 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
982 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
983 {
984 switch(info.bpp / info.numComps)
985 {
986 case 16:
987 {
988 Value* vGatherResult[2];
989 Value *vMask;
990
991 // TODO: vGatherMaskedVal
992 Value* vGatherMaskedVal = VIMMED1((float)0);
993
994 // always have at least one component out of x or y to fetch
995
996 // save mask as it is zero'd out after each gather
997 vMask = mask;
998
999 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1000 // e.g. result of first 8x32bit integer gather for 16bit components
1001 // 256i - 0 1 2 3 4 5 6 7
1002 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1003 //
1004
1005 // if we have at least one component out of z or w to fetch
1006 if(info.numComps > 2)
1007 {
1008 // offset base to the next components(zw) in the vertex to gather
1009 pSrcBase = GEP(pSrcBase, C((char)4));
1010 vMask = mask;
1011
1012 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1013 // e.g. result of second 8x32bit integer gather for 16bit components
1014 // 256i - 0 1 2 3 4 5 6 7
1015 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1016 //
1017 }
1018 else
1019 {
1020 vGatherResult[1] = vGatherMaskedVal;
1021 }
1022
1023 // Shuffle gathered components into place, each row is a component
1024 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1025 }
1026 break;
1027 case 32:
1028 {
1029 // apply defaults
1030 for (uint32_t i = 0; i < 4; ++i)
1031 {
1032 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1033 }
1034
1035 for(uint32_t i = 0; i < info.numComps; i++)
1036 {
1037 uint32_t swizzleIndex = info.swizzle[i];
1038
1039 // save mask as it is zero'd out after each gather
1040 Value *vMask = mask;
1041
1042 // Gather a SIMD of components
1043 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1044
1045 // offset base to the next component to gather
1046 pSrcBase = GEP(pSrcBase, C((char)4));
1047 }
1048 }
1049 break;
1050 default:
1051 SWR_ASSERT(0, "Invalid float format");
1052 break;
1053 }
1054 }
1055
1056 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1057 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1058 {
1059 switch (info.bpp / info.numComps)
1060 {
1061 case 8:
1062 {
1063 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1064 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
1065 // e.g. result of an 8x32bit integer gather for 8bit components
1066 // 256i - 0 1 2 3 4 5 6 7
1067 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1068
1069 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1070 }
1071 break;
1072 case 16:
1073 {
1074 Value* vGatherResult[2];
1075 Value *vMask;
1076
1077 // TODO: vGatherMaskedVal
1078 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1079
1080 // always have at least one component out of x or y to fetch
1081
1082 // save mask as it is zero'd out after each gather
1083 vMask = mask;
1084
1085 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1086 // e.g. result of first 8x32bit integer gather for 16bit components
1087 // 256i - 0 1 2 3 4 5 6 7
1088 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1089 //
1090
1091 // if we have at least one component out of z or w to fetch
1092 if(info.numComps > 2)
1093 {
1094 // offset base to the next components(zw) in the vertex to gather
1095 pSrcBase = GEP(pSrcBase, C((char)4));
1096 vMask = mask;
1097
1098 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1099 // e.g. result of second 8x32bit integer gather for 16bit components
1100 // 256i - 0 1 2 3 4 5 6 7
1101 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1102 //
1103 }
1104 else
1105 {
1106 vGatherResult[1] = vGatherMaskedVal;
1107 }
1108
1109 // Shuffle gathered components into place, each row is a component
1110 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1111
1112 }
1113 break;
1114 case 32:
1115 {
1116 // apply defaults
1117 for (uint32_t i = 0; i < 4; ++i)
1118 {
1119 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1120 }
1121
1122 for(uint32_t i = 0; i < info.numComps; i++)
1123 {
1124 uint32_t swizzleIndex = info.swizzle[i];
1125
1126 // save mask as it is zero'd out after each gather
1127 Value *vMask = mask;
1128
1129 // Gather a SIMD of components
1130 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1131
1132 // offset base to the next component to gather
1133 pSrcBase = GEP(pSrcBase, C((char)4));
1134 }
1135 }
1136 break;
1137 default:
1138 SWR_ASSERT(0, "unsupported format");
1139 break;
1140 }
1141 }
1142
1143 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1144 {
1145 // cast types
1146 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1147 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1148
1149 // input could either be float or int vector; do shuffle work in int
1150 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1151 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1152
1153 if(bPackedOutput)
1154 {
1155 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1156
1157 // shuffle mask
1158 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1159 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1160 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1161 // after pshufb: group components together in each 128bit lane
1162 // 256i - 0 1 2 3 4 5 6 7
1163 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1164
1165 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1166 // after PERMD: move and pack xy components into each 128bit lane
1167 // 256i - 0 1 2 3 4 5 6 7
1168 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1169
1170 // do the same for zw components
1171 Value* vi128ZW = nullptr;
1172 if(info.numComps > 2)
1173 {
1174 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1175 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1176 }
1177
1178 for(uint32_t i = 0; i < 4; i++)
1179 {
1180 uint32_t swizzleIndex = info.swizzle[i];
1181 // todo: fix for packed
1182 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1183 if(i >= info.numComps)
1184 {
1185 // set the default component val
1186 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1187 continue;
1188 }
1189
1190 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1191 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1192 // if x or y, use vi128XY permute result, else use vi128ZW
1193 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1194
1195 // extract packed component 128 bit lanes
1196 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1197 }
1198
1199 }
1200 else
1201 {
1202 // pshufb masks for each component
1203 Value* vConstMask[2];
1204 // x/z shuffle mask
1205 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1206 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1207
1208 // y/w shuffle mask
1209 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1210 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1211
1212
1213 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1214 // apply defaults
1215 for (uint32_t i = 0; i < 4; ++i)
1216 {
1217 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1218 }
1219
1220 for(uint32_t i = 0; i < info.numComps; i++)
1221 {
1222 uint32_t swizzleIndex = info.swizzle[i];
1223
1224 // select correct constMask for x/z or y/w pshufb
1225 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1226 // if x or y, use vi128XY permute result, else use vi128ZW
1227 uint32_t selectedGather = (i < 2) ? 0 : 1;
1228
1229 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1230 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1231 // 256i - 0 1 2 3 4 5 6 7
1232 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1233 }
1234 }
1235 }
1236
1237 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1238 {
1239 // cast types
1240 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1241 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1242
1243 if(bPackedOutput)
1244 {
1245 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1246 // shuffle mask
1247 Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1248 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1249 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1250 // after pshufb: group components together in each 128bit lane
1251 // 256i - 0 1 2 3 4 5 6 7
1252 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1253
1254 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1255 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1256 // 256i - 0 1 2 3 4 5 6 7
1257 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1258
1259 // do the same for zw components
1260 Value* vi128ZW = nullptr;
1261 if(info.numComps > 2)
1262 {
1263 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1264 }
1265
1266 // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
1267 for(uint32_t i = 0; i < 4; i++)
1268 {
1269 uint32_t swizzleIndex = info.swizzle[i];
1270 // todo: fix for packed
1271 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1272 if(i >= info.numComps)
1273 {
1274 // set the default component val
1275 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1276 continue;
1277 }
1278
1279 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1280 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1281 // if x or y, use vi128XY permute result, else use vi128ZW
1282 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1283
1284 // sign extend
1285 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1286 }
1287 }
1288 // else zero extend
1289 else{
1290 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1291 // apply defaults
1292 for (uint32_t i = 0; i < 4; ++i)
1293 {
1294 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1295 }
1296
1297 for(uint32_t i = 0; i < info.numComps; i++){
1298 uint32_t swizzleIndex = info.swizzle[i];
1299
1300 // pshufb masks for each component
1301 Value* vConstMask;
1302 switch(i)
1303 {
1304 case 0:
1305 // x shuffle mask
1306 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1307 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1308 break;
1309 case 1:
1310 // y shuffle mask
1311 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1312 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1313 break;
1314 case 2:
1315 // z shuffle mask
1316 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1317 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1318 break;
1319 case 3:
1320 // w shuffle mask
1321 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1322 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1323 break;
1324 default:
1325 vConstMask = nullptr;
1326 break;
1327 }
1328
1329 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1330 // after pshufb for x channel
1331 // 256i - 0 1 2 3 4 5 6 7
1332 // x000 x000 x000 x000 x000 x000 x000 x000
1333 }
1334 }
1335 }
1336
1337 //////////////////////////////////////////////////////////////////////////
1338 /// @brief emulates a scatter operation.
1339 /// @param pDst - pointer to destination
1340 /// @param vSrc - vector of src data to scatter
1341 /// @param vOffsets - vector of byte offsets from pDst
1342 /// @param vMask - mask of valid lanes
1343 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1344 {
1345 Value* pStack = STACKSAVE();
1346
1347 Type* pSrcTy = vSrc->getType()->getVectorElementType();
1348
1349 // allocate tmp stack for masked off lanes
1350 Value* vTmpPtr = ALLOCA(pSrcTy);
1351
1352 Value *mask = MASK(vMask);
1353 for (uint32_t i = 0; i < mVWidth; ++i)
1354 {
1355 Value *offset = VEXTRACT(vOffsets, C(i));
1356 // byte pointer to component
1357 Value *storeAddress = GEP(pDst, offset);
1358 storeAddress = BITCAST(storeAddress, PointerType::get(pSrcTy, 0));
1359 Value *selMask = VEXTRACT(mask, C(i));
1360 Value *srcElem = VEXTRACT(vSrc, C(i));
1361 // switch in a safe scratch address to store to when the lane is masked off
1362 Value *validAddress = SELECT(selMask, storeAddress, vTmpPtr);
1363 STORE(srcElem, validAddress);
1364 }
1365
1366 STACKRESTORE(pStack);
1367 }
1368
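// Editor's sketch (assumption, not part of the original source): scalar
// reference for the emulated scatter above. Only active lanes (mask sign bit
// set) store to pDst + vOffsets[i]; masked-off lanes are redirected to the
// scratch slot and their writes are discarded.
static inline void ScatterPSReference(uint8_t* pDst, const float src[8],
                                      const int32_t offsets[8], const int32_t mask[8])
{
    for (int i = 0; i < 8; ++i)
    {
        if (mask[i] < 0)
        {
            *(float*)(pDst + offsets[i]) = src[i];
        }
    }
}
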
1369 Value* Builder::VABSPS(Value* a)
1370 {
1371 Value* asInt = BITCAST(a, mSimdInt32Ty);
1372 Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1373 return result;
1374 }
1375
1376 Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
1377 {
1378 Value *lowCmp = ICMP_SLT(src, low);
1379 Value *ret = SELECT(lowCmp, low, src);
1380
1381 Value *highCmp = ICMP_SGT(ret, high);
1382 ret = SELECT(highCmp, high, ret);
1383
1384 return ret;
1385 }
1386
1387 Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1388 {
1389 Value *lowCmp = FCMP_OLT(src, low);
1390 Value *ret = SELECT(lowCmp, low, src);
1391
1392 Value *highCmp = FCMP_OGT(ret, high);
1393 ret = SELECT(highCmp, high, ret);
1394
1395 return ret;
1396 }
1397
1398 Value *Builder::FCLAMP(Value* src, float low, float high)
1399 {
1400 Value* result = VMAXPS(src, VIMMED1(low));
1401 result = VMINPS(result, VIMMED1(high));
1402
1403 return result;
1404 }
1405
1406 //////////////////////////////////////////////////////////////////////////
1407 /// @brief save/restore stack, providing ability to push/pop the stack and
1408 /// reduce overall stack requirements for temporary stack use
1409 Value* Builder::STACKSAVE()
1410 {
1411 Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1412 #if HAVE_LLVM == 0x306
1413 return CALL(pfnStackSave);
1414 #else
1415 return CALLA(pfnStackSave);
1416 #endif
1417 }
1418
1419 void Builder::STACKRESTORE(Value* pSaved)
1420 {
1421 Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1422 CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1423 }
1424
1425 Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1426 {
1427 Value* vOut;
1428 // use FMADs if available
1429 if(JM()->mArch.AVX2())
1430 {
1431 vOut = VFMADDPS(a, b, c);
1432 }
1433 else
1434 {
1435 vOut = FADD(FMUL(a, b), c);
1436 }
1437 return vOut;
1438 }
1439
1440 Value* Builder::POPCNT(Value* a)
1441 {
1442 Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1443 return CALL(pCtPop, std::initializer_list<Value*>{a});
1444 }
1445
1446 //////////////////////////////////////////////////////////////////////////
1447 /// @brief C functions called by LLVM IR
1448 //////////////////////////////////////////////////////////////////////////
1449
1450 //////////////////////////////////////////////////////////////////////////
1451 /// @brief called in JIT code, inserted by PRINT
1452 /// output to both stdout and visual studio debug console
1453 void __cdecl CallPrint(const char* fmt, ...)
1454 {
1455 va_list args;
1456 va_start(args, fmt);
1457 vprintf(fmt, args);
1458
1459 #if defined( _WIN32 )
1460 char strBuf[1024];
1461 vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1462 OutputDebugString(strBuf);
1463 #endif
1464
1465 va_end(args);
1466 }
1467
1468 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1469 {
1470 #if HAVE_LLVM == 0x306
1471 Function *func =
1472 Intrinsic::getDeclaration(JM()->mpCurrentModule,
1473 Intrinsic::x86_avx_vextractf128_si_256);
1474 return CALL(func, {a, imm8});
1475 #else
1476 bool flag = !imm8->isZeroValue();
1477 SmallVector<Constant*,8> idx;
1478 for (unsigned i = 0; i < mVWidth / 2; i++) {
1479 idx.push_back(C(flag ? i + mVWidth / 2 : i));
1480 }
1481 return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1482 #endif
1483 }
1484
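// Editor's note (illustrative, not part of the original source): for the
// 8-wide case, the shuffle fallback above returns lanes 0..3 of 'a' when
// imm8 is zero and lanes 4..7 otherwise, matching vextractf128/vextracti128.
static inline void Extracti128Reference(const int32_t a[8], int imm8, int32_t out[4])
{
    const int base = (imm8 & 1) ? 4 : 0;
    for (int i = 0; i < 4; ++i)
    {
        out[i] = a[base + i];
    }
}
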
1485 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1486 {
1487 #if HAVE_LLVM == 0x306
1488 Function *func =
1489 Intrinsic::getDeclaration(JM()->mpCurrentModule,
1490 Intrinsic::x86_avx_vinsertf128_si_256);
1491 return CALL(func, {a, b, imm8});
1492 #else
1493 bool flag = !imm8->isZeroValue();
1494 SmallVector<Constant*,8> idx;
1495 for (unsigned i = 0; i < mVWidth; i++) {
1496 idx.push_back(C(i));
1497 }
1498 Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1499
1500 SmallVector<Constant*,8> idx2;
1501 for (unsigned i = 0; i < mVWidth / 2; i++) {
1502 idx2.push_back(C(flag ? i : i + mVWidth));
1503 }
1504 for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1505 idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1506 }
1507 return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1508 #endif
1509 }
1510
1511 // rdtsc buckets macros
1512 void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1513 {
1514 std::vector<Type*> args{
1515 PointerType::get(mInt32Ty, 0), // pBucketMgr
1516 mInt32Ty // id
1517 };
1518
1519 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1520 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1521 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1522 {
1523 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1524 }
1525
1526 CALL(pFunc, { pBucketMgr, pId });
1527 }
1528
1529 void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1530 {
1531 std::vector<Type*> args{
1532 PointerType::get(mInt32Ty, 0), // pBucketMgr
1533 mInt32Ty // id
1534 };
1535
1536 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1537 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1538 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1539 {
1540 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1541 }
1542
1543 CALL(pFunc, { pBucketMgr, pId });
1544 }
1545