swr: [rasterizer jitter] vpermps support
[mesa.git] src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "common/rdtsc_buckets.h"
32
33 #include "llvm/Support/DynamicLibrary.h"
34
35 void __cdecl CallPrint(const char* fmt, ...);
36
37 //////////////////////////////////////////////////////////////////////////
38 /// @brief Convert an IEEE 754 32-bit single precision float to an
39 /// 16 bit float with 5 exponent bits and a variable
40 /// number of mantissa bits.
41 /// @param val - 32-bit float
42 /// @todo Maybe move this outside of this file into a header?
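/// Worked example (illustrative): 1.0f is 0x3F800000 (sign 0, biased exponent
/// 127, mantissa 0); rebiasing the exponent (127 - 0x70 = 15) and dropping the
/// low 13 mantissa bits gives (15 << 10) = 0x3C00, the half-precision 1.0.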
43 static uint16_t Convert32To16Float(float val)
44 {
45 uint32_t sign, exp, mant;
46 uint32_t roundBits;
47
48 // Extract the sign, exponent, and mantissa
49 uint32_t uf = *(uint32_t*)&val;
50 sign = (uf & 0x80000000) >> 31;
51 exp = (uf & 0x7F800000) >> 23;
52 mant = uf & 0x007FFFFF;
53
54 // Check for out of range
55 if (std::isnan(val))
56 {
57 exp = 0x1F;
58 mant = 0x200;
59 sign = 1; // set the sign bit for NANs
60 }
61 else if (std::isinf(val))
62 {
63 exp = 0x1f;
64 mant = 0x0;
65 }
66 else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
67 {
68 exp = 0x1E;
69 mant = 0x3FF;
70 }
71 else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
72 {
73 mant |= 0x00800000;
74 for (; exp <= 0x70; mant >>= 1, exp++)
75 ;
76 exp = 0;
77 mant = mant >> 13;
78 }
79 else if (exp < 0x66) // Too small to represent -> Zero
80 {
81 exp = 0;
82 mant = 0;
83 }
84 else
85 {
86 // Saves bits that will be shifted off for rounding
87 roundBits = mant & 0x1FFFu;
88 // convert exponent and mantissa to 16 bit format
89 exp = exp - 0x70;
90 mant = mant >> 13;
91
92 // Essentially RTZ, but round up if off by only 1 lsb
93 if (roundBits == 0x1FFFu)
94 {
95 mant++;
96 // check for overflow
97 if ((mant & 0xC00u) != 0)
98 exp++;
99 // make sure only the needed bits are used
100 mant &= 0x3FF;
101 }
102 }
103
104 uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
105 return (uint16_t)tmpVal;
106 }
107
108 //////////////////////////////////////////////////////////////////////////
109 /// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
110 /// float
111 /// @param val - 16-bit float
112 /// @todo Maybe move this outside of this file into a header?
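/// Worked example (illustrative): 0x3C00 has sign 0, a biased exponent of 15
/// and a zero mantissa; rebiasing the exponent (15 - 15 + 127 = 127) gives
/// 0x3F800000, i.e. 1.0f.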
113 static float ConvertSmallFloatTo32(UINT val)
114 {
115 UINT result;
116 if ((val & 0x7fff) == 0)
117 {
118 result = ((uint32_t)(val & 0x8000)) << 16;
119 }
120 else if ((val & 0x7c00) == 0x7c00)
121 {
122 result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
123 result |= ((uint32_t)val & 0x8000) << 16;
124 }
125 else
126 {
127 uint32_t sign = (val & 0x8000) << 16;
128 uint32_t mant = (val & 0x3ff) << 13;
129 uint32_t exp = (val >> 10) & 0x1f;
130 if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
131 {
132 mant <<= 1;
133 while (mant < (0x400 << 13))
134 {
135 exp--;
136 mant <<= 1;
137 }
138 mant &= (0x3ff << 13);
139 }
140 exp = ((exp - 15 + 127) & 0xff) << 23;
141 result = sign | exp | mant;
142 }
143
144 return *(float*)&result;
145 }
146
147 Constant *Builder::C(bool i)
148 {
149 return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
150 }
151
152 Constant *Builder::C(char i)
153 {
154 return ConstantInt::get(IRB()->getInt8Ty(), i);
155 }
156
157 Constant *Builder::C(uint8_t i)
158 {
159 return ConstantInt::get(IRB()->getInt8Ty(), i);
160 }
161
162 Constant *Builder::C(int i)
163 {
164 return ConstantInt::get(IRB()->getInt32Ty(), i);
165 }
166
167 Constant *Builder::C(int64_t i)
168 {
169 return ConstantInt::get(IRB()->getInt64Ty(), i);
170 }
171
172 Constant *Builder::C(uint16_t i)
173 {
174 return ConstantInt::get(mInt16Ty,i);
175 }
176
177 Constant *Builder::C(uint32_t i)
178 {
179 return ConstantInt::get(IRB()->getInt32Ty(), i);
180 }
181
182 Constant *Builder::C(float i)
183 {
184 return ConstantFP::get(IRB()->getFloatTy(), i);
185 }
186
187 Constant *Builder::PRED(bool pred)
188 {
189 return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
190 }
191
192 Value *Builder::VIMMED1(int i)
193 {
194 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
195 }
196
197 Value *Builder::VIMMED1(uint32_t i)
198 {
199 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
200 }
201
202 Value *Builder::VIMMED1(float i)
203 {
204 return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
205 }
206
207 Value *Builder::VIMMED1(bool i)
208 {
209 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
210 }
211
212 Value *Builder::VUNDEF_IPTR()
213 {
214 return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
215 }
216
217 Value *Builder::VUNDEF_I()
218 {
219 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
220 }
221
222 Value *Builder::VUNDEF(Type *ty, uint32_t size)
223 {
224 return UndefValue::get(VectorType::get(ty, size));
225 }
226
227 Value *Builder::VUNDEF_F()
228 {
229 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
230 }
231
232 Value *Builder::VUNDEF(Type* t)
233 {
234 return UndefValue::get(VectorType::get(t, mVWidth));
235 }
236
237 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
238 Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
239 {
240 return VINSERT(vec, val, C((int64_t)index));
241 }
242 #endif
243
244 Value *Builder::VBROADCAST(Value *src)
245 {
246 // check if src is already a vector
247 if (src->getType()->isVectorTy())
248 {
249 return src;
250 }
251
252 return VECTOR_SPLAT(mVWidth, src);
253 }
254
255 uint32_t Builder::IMMED(Value* v)
256 {
257 SWR_ASSERT(isa<ConstantInt>(v));
258 ConstantInt *pValConst = cast<ConstantInt>(v);
259 return pValConst->getZExtValue();
260 }
261
262 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
263 {
264 std::vector<Value*> indices;
265 for (auto i : indexList)
266 indices.push_back(i);
267 return GEPA(ptr, indices);
268 }
269
270 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
271 {
272 std::vector<Value*> indices;
273 for (auto i : indexList)
274 indices.push_back(C(i));
275 return GEPA(ptr, indices);
276 }
277
278 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
279 {
280 std::vector<Value*> valIndices;
281 for (auto i : indices)
282 valIndices.push_back(C(i));
283 return LOAD(GEPA(basePtr, valIndices), name);
284 }
285
286 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
287 {
288 std::vector<Value*> valIndices;
289 for (auto i : indices)
290 valIndices.push_back(i);
291 return LOAD(GEPA(basePtr, valIndices), name);
292 }
293
294 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
295 {
296 std::vector<Value*> valIndices;
297 for (auto i : indices)
298 valIndices.push_back(C(i));
299 return STORE(val, GEPA(basePtr, valIndices));
300 }
301
302 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
303 {
304 std::vector<Value*> valIndices;
305 for (auto i : indices)
306 valIndices.push_back(i);
307 return STORE(val, GEPA(basePtr, valIndices));
308 }
309
310 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
311 {
312 std::vector<Value*> args;
313 for (auto arg : argsList)
314 args.push_back(arg);
315 return CALLA(Callee, args);
316 }
317
318 Value *Builder::VRCP(Value *va)
319 {
320 return FDIV(VIMMED1(1.0f), va); // 1 / a
321 }
322
323 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
324 {
325 Value* vOut = FMADDPS(vA, vX, vC);
326 vOut = FMADDPS(vB, vY, vOut);
327 return vOut;
328 }
329
330 //////////////////////////////////////////////////////////////////////////
331 /// @brief Generate an i32 masked load operation in LLVM IR. If not
332 /// supported on the underlying platform, emulate it with float masked load
333 /// @param src - base address pointer for the load
334 /// @param vMask - SIMD wide mask that controls whether to access memory or load 0 for the lane
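/// Usage sketch (illustrative; pBuf and vLaneMask are hypothetical values):
///     Value* vData = MASKLOADD(pBuf, vLaneMask);
/// lanes whose mask sign bit is clear are not read and yield 0.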
335 Value *Builder::MASKLOADD(Value* src,Value* mask)
336 {
337 Value* vResult;
338     // use avx2 masked load instruction if available
339 if(JM()->mArch.AVX2())
340 {
341 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
342 vResult = CALL(func,{src,mask});
343 }
344 else
345 {
346 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
347 Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
348 vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,mVWidth));
349 }
350 return vResult;
351 }
352
353 //////////////////////////////////////////////////////////////////////////
354 /// @brief insert a JIT call to CallPrint
355 /// - outputs formatted string to both stdout and VS output window
356 /// - DEBUG builds only
357 /// Usage example:
358 /// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
359 /// where C(lane) creates a constant value to print, and pIndex is the Value*
360 /// result from a GEP, printing out the pointer to memory
361 /// @param printStr - constant string to print, which includes format specifiers
362 /// @param printArgs - initializer list of Value*'s to print to stdout
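/// A '%t' specifier is expanded per lane for vector Values, e.g. (vOffsets
/// being a hypothetical simd Value*):
///     PRINT("lane offsets: %t\n", {vOffsets});
/// the format string is rewritten to one %d (or %f) per element before the
/// call is emitted.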
363 CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
364 {
365 // push the arguments to CallPrint into a vector
366 std::vector<Value*> printCallArgs;
367 // save room for the format string. we still need to modify it for vectors
368 printCallArgs.resize(1);
369
370 // search through the format string for special processing
371 size_t pos = 0;
372 std::string tempStr(printStr);
373 pos = tempStr.find('%', pos);
374 auto v = printArgs.begin();
375
376 while ((pos != std::string::npos) && (v != printArgs.end()))
377 {
378 Value* pArg = *v;
379 Type* pType = pArg->getType();
380
381 if (tempStr[pos + 1] == 't')
382 {
383 if (pType->isVectorTy())
384 {
385 Type* pContainedType = pType->getContainedType(0);
386
387 std::string vectorFormatStr;
388
389 if (pContainedType->isFloatTy())
390 {
391                     tempStr[pos + 1] = 'f'; // Ensure it's %f
392 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(0)), mDoubleTy));
393
394 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
395 {
396 vectorFormatStr += "%f ";
397 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), mDoubleTy));
398 }
399 }
400 else if (pContainedType->isIntegerTy())
401 {
402                     tempStr[pos + 1] = 'd'; // Ensure it's %d
403 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
404
405 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
406 {
407 vectorFormatStr += "%d ";
408 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
409 }
410 }
411 else
412 {
413                     SWR_ASSERT(0, "Unsupported type");
414 }
415
416 tempStr.insert(pos, vectorFormatStr);
417 pos += vectorFormatStr.size();
418 }
419 else
420 {
421 if (pType->isFloatTy())
422 {
423                     tempStr[pos + 1] = 'f'; // Ensure it's %f
424 printCallArgs.push_back(FP_EXT(pArg, mDoubleTy));
425 }
426 else if (pType->isIntegerTy())
427 {
428                     tempStr[pos + 1] = 'd'; // Ensure it's %d
429 printCallArgs.push_back(pArg);
430 }
431 }
432 }
433 else if (toupper(tempStr[pos + 1]) == 'X')
434 {
435 if (pType->isVectorTy())
436 {
437 tempStr[pos] = '0';
438 tempStr.insert(pos + 1, "x%08");
439
440 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
441
442 std::string vectorFormatStr;
443 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
444 {
445 vectorFormatStr += "0x%08X ";
446 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
447 }
448
449 tempStr.insert(pos, vectorFormatStr);
450 pos += vectorFormatStr.size();
451 }
452 else
453 {
454 tempStr[pos] = '0';
455 tempStr.insert(pos + 1, "x%08");
456 printCallArgs.push_back(pArg);
457 pos += 3;
458 }
459 }
460 // for %f we need to cast float Values to doubles so that they print out correctly
461 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
462 {
463 printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
464 pos++;
465 }
466 // add special handling for %f and %d format specifiers to make printing llvm vector types easier
467 else if (pType->isVectorTy())
468 {
469 Type* pContainedType = pType->getContainedType(0);
470
471 if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
472 {
473 uint32_t i = 0;
474 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
475 {
476 tempStr.insert(pos, std::string("%f "));
477 pos += 3;
478 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
479 }
480 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
481 }
482 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
483 {
484 uint32_t i = 0;
485 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
486 {
487 tempStr.insert(pos, std::string("%d "));
488 pos += 3;
489 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
490 }
491 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
492 }
493 else
494 {
495 /// not a supported vector to print
496 /// @todo pointer types too
497 SWR_ASSERT(0);
498 }
499 }
500 else
501 {
502 printCallArgs.push_back(pArg);
503 }
504
505         // advance to the next argument
506 v++;
507 pos = tempStr.find('%', ++pos);
508 }
509
510 // create global variable constant string
511 Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
512 GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
513 JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
514
515 // get a pointer to the first character in the constant string array
516 std::vector<Constant*> geplist{C(0),C(0)};
517 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
518 Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
519 #else
520 Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
521 #endif
522
523 // insert the pointer to the format string in the argument vector
524 printCallArgs[0] = strGEP;
525
526 // get pointer to CallPrint function and insert decl into the module if needed
527 std::vector<Type*> args;
528 args.push_back(PointerType::get(mInt8Ty,0));
529 FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
530 Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
531
532 // if we haven't yet added the symbol to the symbol table
533 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
534 {
535 sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
536 }
537
538 // insert a call to CallPrint
539 return CALLA(callPrintFn,printCallArgs);
540 }
541
542 //////////////////////////////////////////////////////////////////////////
543 /// @brief Wrapper around PRINT with initializer list.
544 CallInst* Builder::PRINT(const std::string &printStr)
545 {
546 return PRINT(printStr, {});
547 }
548
549 //////////////////////////////////////////////////////////////////////////
550 /// @brief Generate a masked gather operation in LLVM IR. If not
551 /// supported on the underlying platform, emulate it with loads
552 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
553 /// @param pBase - Int8* base VB address pointer value
554 /// @param vIndices - SIMD wide value of VB byte offsets
555 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
556 /// @param scale - value to scale indices by
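/// Usage sketch (illustrative; the names are hypothetical):
///     Value* vResult = GATHERPS(vDefaults, pVertexBase, vByteOffsets, vLaneMask, C((char)1));
/// each active lane loads a float from pVertexBase + vByteOffsets[lane] * scale;
/// masked-off lanes keep the corresponding element of vDefaults.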
557 Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
558 {
559 Value* vGather;
560
561 // use avx2 gather instruction if available
562 if(JM()->mArch.AVX2())
563 {
564 // force mask to <N x float>, required by vgather
565 vMask = BITCAST(vMask, mSimdFP32Ty);
566 vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
567 }
568 else
569 {
570 Value* pStack = STACKSAVE();
571
572 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
573 Value* vSrcPtr = ALLOCA(vSrc->getType());
574 STORE(vSrc, vSrcPtr);
575
576 vGather = VUNDEF_F();
577 Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
578 Value *vOffsets = MUL(vIndices,vScaleVec);
579 Value *mask = MASK(vMask);
580 for(uint32_t i = 0; i < mVWidth; ++i)
581 {
582 // single component byte index
583 Value *offset = VEXTRACT(vOffsets,C(i));
584 // byte pointer to component
585 Value *loadAddress = GEP(pBase,offset);
586 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
587 // pointer to the value to load if we're masking off a component
588 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
589 Value *selMask = VEXTRACT(mask,C(i));
590             // switch in a safe address to load from if this lane is masked off
591 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
592 Value *val = LOAD(validAddress);
593 vGather = VINSERT(vGather,val,C(i));
594 }
595 STACKRESTORE(pStack);
596 }
597
598 return vGather;
599 }
600
601 //////////////////////////////////////////////////////////////////////////
602 /// @brief Generate a masked gather operation in LLVM IR. If not
603 /// supported on the underlying platform, emulate it with loads
604 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
605 /// @param pBase - Int8* base VB address pointer value
606 /// @param vIndices - SIMD wide value of VB byte offsets
607 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
608 /// @param scale - value to scale indices by
609 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
610 {
611 Value* vGather;
612
613 // use avx2 gather instruction if available
614 if(JM()->mArch.AVX2())
615 {
616 vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
617 }
618 else
619 {
620 Value* pStack = STACKSAVE();
621
622 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
623 Value* vSrcPtr = ALLOCA(vSrc->getType());
624 STORE(vSrc, vSrcPtr);
625
626 vGather = VUNDEF_I();
627 Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
628 Value *vOffsets = MUL(vIndices, vScaleVec);
629 Value *mask = MASK(vMask);
630 for(uint32_t i = 0; i < mVWidth; ++i)
631 {
632 // single component byte index
633 Value *offset = VEXTRACT(vOffsets, C(i));
634 // byte pointer to component
635 Value *loadAddress = GEP(pBase, offset);
636 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
637 // pointer to the value to load if we're masking off a component
638 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
639 Value *selMask = VEXTRACT(mask, C(i));
640             // switch in a safe address to load from if this lane is masked off
641 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
642 Value *val = LOAD(validAddress, C(0));
643 vGather = VINSERT(vGather, val, C(i));
644 }
645
646 STACKRESTORE(pStack);
647 }
648 return vGather;
649 }
650
651 //////////////////////////////////////////////////////////////////////////
652 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
653 Value* Builder::MASK(Value* vmask)
654 {
655 Value* src = BITCAST(vmask, mSimdInt32Ty);
656 return ICMP_SLT(src, VIMMED1(0));
657 }
658
659 //////////////////////////////////////////////////////////////////////////
660 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
661 Value* Builder::VMASK(Value* mask)
662 {
663 return S_EXT(mask, mSimdInt32Ty);
664 }
665
666 //////////////////////////////////////////////////////////////////////////
667 /// @brief Generate a VPSHUFB operation in LLVM IR. If not
668 /// supported on the underlying platform, emulate it
669 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
670 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
671 /// Byte masks in the lower 128bit lane of b select 8 bit values from the lower
672 /// 128bits of a, and vice versa for the upper lanes. If the mask
673 /// value is negative, '0' is inserted.
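/// Example (illustrative): with a holding bytes 0..31, a mask byte of 0x02 in
/// the low 128bit lane selects a[2], while a mask byte of 0x80 (negative)
/// produces 0 in that output position.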
674 Value *Builder::PSHUFB(Value* a, Value* b)
675 {
676 Value* res;
677 // use avx2 pshufb instruction if available
678 if(JM()->mArch.AVX2())
679 {
680 res = VPSHUFB(a, b);
681 }
682 else
683 {
684 Constant* cB = dyn_cast<Constant>(b);
685 // number of 8 bit elements in b
686 uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
687 // output vector
688 Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
689
690 // insert an 8 bit value from the high and low lanes of a per loop iteration
691 numElms /= 2;
692 for(uint32_t i = 0; i < numElms; i++)
693 {
694 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
695 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
696
697 // extract values from constant mask
698 char valLow128bLane = (char)(cLow128b->getSExtValue());
699 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
700
701 Value* insertValLow128b;
702 Value* insertValHigh128b;
703
704 // if the mask value is negative, insert a '0' in the respective output position
705 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
706 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
707 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
708
709 vShuf = VINSERT(vShuf, insertValLow128b, i);
710 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
711 }
712 res = vShuf;
713 }
714 return res;
715 }
716
717 //////////////////////////////////////////////////////////////////////////
718 /// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32
719 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
720 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
721 /// lower 8 values are used.
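/// Example (illustrative): an input byte of 0xFF (-1) in lane 3 becomes
/// 0xFFFFFFFF (-1) in lane 3 of the 8x32bit result.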
722 Value *Builder::PMOVSXBD(Value* a)
723 {
724 Value* res;
725 // use avx2 byte sign extend instruction if available
726 if(JM()->mArch.AVX2())
727 {
728 res = VPMOVSXBD(a);
729 }
730 else
731 {
732 // VPMOVSXBD output type
733 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
734 // Extract 8 values from 128bit lane and sign extend
735 res = S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
736 }
737 return res;
738 }
739
740 //////////////////////////////////////////////////////////////////////////
741 /// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32
742 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
743 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
744 Value *Builder::PMOVSXWD(Value* a)
745 {
746 Value* res;
747 // use avx2 word sign extend if available
748 if(JM()->mArch.AVX2())
749 {
750 res = VPMOVSXWD(a);
751 }
752 else
753 {
754 // VPMOVSXWD output type
755 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
756 // Extract 8 values from 128bit lane and sign extend
757 res = S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
758 }
759 return res;
760 }
761
762 //////////////////////////////////////////////////////////////////////////
763 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
764 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
765 /// platform, emulate it
766 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
767 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
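/// Example (illustrative): idx = {7, 6, 5, 4, 3, 2, 1, 0} reverses the eight
/// 32bit elements of a, including moves across the 128bit lane boundary.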
768 Value *Builder::PERMD(Value* a, Value* idx)
769 {
770 Value* res;
771 // use avx2 permute instruction if available
772 if(JM()->mArch.AVX2())
773 {
774 // llvm 3.6.0 swapped the order of the args to vpermd
775 res = VPERMD(idx, a);
776 }
777 else
778 {
779 if (isa<Constant>(idx))
780 {
781 res = VSHUFFLE(a, a, idx);
782 }
783 else
784 {
785 res = VUNDEF_I();
786 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
787 {
788 Value* pIndex = VEXTRACT(idx, C(l));
789 Value* pVal = VEXTRACT(a, pIndex);
790 res = VINSERT(res, pVal, C(l));
791 }
792 }
793 }
794 return res;
795 }
796
797 //////////////////////////////////////////////////////////////////////////
798 /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
799 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
800 /// platform, emulate it
801 /// @param a - 256bit SIMD lane(8x32bit) of float values.
802 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
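/// Usage sketch (illustrative; vFloats and vIndices are hypothetical values):
///     Value* vShuffled = PERMPS(vFloats, vIndices);
/// same lane-crossing shuffle as PERMD above, but the elements stay typed as
/// floats.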
803 Value *Builder::PERMPS(Value* a, Value* idx)
804 {
805 Value* res;
806 // use avx2 permute instruction if available
807 if (JM()->mArch.AVX2())
808 {
809 // llvm 3.6.0 swapped the order of the args to vpermd
810 res = VPERMPS(idx, a);
811 }
812 else
813 {
814 if (isa<Constant>(idx))
815 {
816 res = VSHUFFLE(a, a, idx);
817 }
818 else
819 {
820 res = VUNDEF_F();
821 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
822 {
823 Value* pIndex = VEXTRACT(idx, C(l));
824 Value* pVal = VEXTRACT(a, pIndex);
825 res = VINSERT(res, pVal, C(l));
826 }
827 }
828 }
829
830 return res;
831 }
832
833 //////////////////////////////////////////////////////////////////////////
834 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
835 /// in LLVM IR. If not supported on the underlying platform, emulate it
836 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
837 Value *Builder::CVTPH2PS(Value* a)
838 {
839 if (JM()->mArch.F16C())
840 {
841 return VCVTPH2PS(a);
842 }
843 else
844 {
845 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
846 Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));
847
848 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
849 {
850 sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
851 }
852
853 Value* pResult = UndefValue::get(mSimdFP32Ty);
854 for (uint32_t i = 0; i < mVWidth; ++i)
855 {
856 Value* pSrc = VEXTRACT(a, C(i));
857 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
858 pResult = VINSERT(pResult, pConv, C(i));
859 }
860
861 return pResult;
862 }
863 }
864
865 //////////////////////////////////////////////////////////////////////////
866 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
867 /// in LLVM IR. If not supported on the underlying platform, emulate it
868 /// @param a - 256bit SIMD lane(8x32bit) of float32 values.
869 Value *Builder::CVTPS2PH(Value* a, Value* rounding)
870 {
871 if (JM()->mArch.F16C())
872 {
873 return VCVTPS2PH(a, rounding);
874 }
875 else
876 {
877 // call scalar C function for now
878 FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
879 Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));
880
881 if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
882 {
883 sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
884 }
885
886 Value* pResult = UndefValue::get(mSimdInt16Ty);
887 for (uint32_t i = 0; i < mVWidth; ++i)
888 {
889 Value* pSrc = VEXTRACT(a, C(i));
890 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
891 pResult = VINSERT(pResult, pConv, C(i));
892 }
893
894 return pResult;
895 }
896 }
897
898 Value *Builder::PMAXSD(Value* a, Value* b)
899 {
900 if (JM()->mArch.AVX2())
901 {
902 return VPMAXSD(a, b);
903 }
904 else
905 {
906 // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
907 Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);
908
909 // low 128
910 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
911 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
912 Value* resLo = CALL(pmaxsd, {aLo, bLo});
913
914 // high 128
915 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
916 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
917 Value* resHi = CALL(pmaxsd, {aHi, bHi});
918
919 // combine
920 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
921 result = VINSERTI128(result, resHi, C((uint8_t)1));
922
923 return result;
924 }
925 }
926
927 Value *Builder::PMINSD(Value* a, Value* b)
928 {
929 if (JM()->mArch.AVX2())
930 {
931 return VPMINSD(a, b);
932 }
933 else
934 {
935         // use 4-wide sse min intrinsic on lower/upper halves of 8-wide sources
936 Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);
937
938 // low 128
939 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
940 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
941 Value* resLo = CALL(pminsd, {aLo, bLo});
942
943 // high 128
944 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
945 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
946 Value* resHi = CALL(pminsd, {aHi, bHi});
947
948 // combine
949 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
950 result = VINSERTI128(result, resHi, C((uint8_t)1));
951
952 return result;
953 }
954 }
955
956 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
957 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
958 {
959 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
960 if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
961 {
962 // ensure our mask is the correct type
963 mask = BITCAST(mask, mSimdFP32Ty);
964 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
965 }
966 else
967 {
968 // ensure our mask is the correct type
969 mask = BITCAST(mask, mSimdInt32Ty);
970 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
971 }
972 }
973
974 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
975 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
976 {
977 switch(info.bpp / info.numComps)
978 {
979 case 16:
980 {
981 Value* vGatherResult[2];
982 Value *vMask;
983
984 // TODO: vGatherMaskedVal
985 Value* vGatherMaskedVal = VIMMED1((float)0);
986
987 // always have at least one component out of x or y to fetch
988
989 // save mask as it is zero'd out after each gather
990 vMask = mask;
991
992 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
993 // e.g. result of first 8x32bit integer gather for 16bit components
994 // 256i - 0 1 2 3 4 5 6 7
995 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
996 //
997
998             // if we have at least one component out of z or w to fetch
999 if(info.numComps > 2)
1000 {
1001 // offset base to the next components(zw) in the vertex to gather
1002 pSrcBase = GEP(pSrcBase, C((char)4));
1003 vMask = mask;
1004
1005 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1006 // e.g. result of second 8x32bit integer gather for 16bit components
1007 // 256i - 0 1 2 3 4 5 6 7
1008 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1009 //
1010 }
1011 else
1012 {
1013 vGatherResult[1] = vGatherMaskedVal;
1014 }
1015
1016 // Shuffle gathered components into place, each row is a component
1017 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1018 }
1019 break;
1020 case 32:
1021 {
1022 // apply defaults
1023 for (uint32_t i = 0; i < 4; ++i)
1024 {
1025 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1026 }
1027
1028 for(uint32_t i = 0; i < info.numComps; i++)
1029 {
1030 uint32_t swizzleIndex = info.swizzle[i];
1031
1032 // save mask as it is zero'd out after each gather
1033 Value *vMask = mask;
1034
1035 // Gather a SIMD of components
1036 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1037
1038 // offset base to the next component to gather
1039 pSrcBase = GEP(pSrcBase, C((char)4));
1040 }
1041 }
1042 break;
1043 default:
1044 SWR_ASSERT(0, "Invalid float format");
1045 break;
1046 }
1047 }
1048
1049 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1050 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1051 {
1052 switch (info.bpp / info.numComps)
1053 {
1054 case 8:
1055 {
1056 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1057 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
1058 // e.g. result of an 8x32bit integer gather for 8bit components
1059 // 256i - 0 1 2 3 4 5 6 7
1060 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1061
1062 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1063 }
1064 break;
1065 case 16:
1066 {
1067 Value* vGatherResult[2];
1068 Value *vMask;
1069
1070 // TODO: vGatherMaskedVal
1071 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1072
1073 // always have at least one component out of x or y to fetch
1074
1075 // save mask as it is zero'd out after each gather
1076 vMask = mask;
1077
1078 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1079 // e.g. result of first 8x32bit integer gather for 16bit components
1080 // 256i - 0 1 2 3 4 5 6 7
1081 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1082 //
1083
1084             // if we have at least one component out of z or w to fetch
1085 if(info.numComps > 2)
1086 {
1087 // offset base to the next components(zw) in the vertex to gather
1088 pSrcBase = GEP(pSrcBase, C((char)4));
1089 vMask = mask;
1090
1091 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1092 // e.g. result of second 8x32bit integer gather for 16bit components
1093 // 256i - 0 1 2 3 4 5 6 7
1094 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1095 //
1096 }
1097 else
1098 {
1099 vGatherResult[1] = vGatherMaskedVal;
1100 }
1101
1102 // Shuffle gathered components into place, each row is a component
1103 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1104
1105 }
1106 break;
1107 case 32:
1108 {
1109 // apply defaults
1110 for (uint32_t i = 0; i < 4; ++i)
1111 {
1112 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1113 }
1114
1115 for(uint32_t i = 0; i < info.numComps; i++)
1116 {
1117 uint32_t swizzleIndex = info.swizzle[i];
1118
1119 // save mask as it is zero'd out after each gather
1120 Value *vMask = mask;
1121
1122 // Gather a SIMD of components
1123 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1124
1125 // offset base to the next component to gather
1126 pSrcBase = GEP(pSrcBase, C((char)4));
1127 }
1128 }
1129 break;
1130 default:
1131 SWR_ASSERT(0, "unsupported format");
1132 break;
1133 }
1134 }
1135
1136 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1137 {
1138 // cast types
1139 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1140 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1141
1142 // input could either be float or int vector; do shuffle work in int
1143 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1144 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1145
1146 if(bPackedOutput)
1147 {
1148 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1149
1150 // shuffle mask
1151 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1152 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1153 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1154 // after pshufb: group components together in each 128bit lane
1155 // 256i - 0 1 2 3 4 5 6 7
1156 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1157
1158 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1159 // after PERMD: move and pack xy components into each 128bit lane
1160 // 256i - 0 1 2 3 4 5 6 7
1161 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1162
1163 // do the same for zw components
1164 Value* vi128ZW = nullptr;
1165 if(info.numComps > 2)
1166 {
1167 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1168 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1169 }
1170
1171 for(uint32_t i = 0; i < 4; i++)
1172 {
1173 uint32_t swizzleIndex = info.swizzle[i];
1174             // todo: fix for packed
1175 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1176 if(i >= info.numComps)
1177 {
1178 // set the default component val
1179 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1180 continue;
1181 }
1182
1183 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1184 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1185 // if x or y, use vi128XY permute result, else use vi128ZW
1186 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1187
1188 // extract packed component 128 bit lanes
1189 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1190 }
1191
1192 }
1193 else
1194 {
1195 // pshufb masks for each component
1196 Value* vConstMask[2];
1197 // x/z shuffle mask
1198 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1199 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1200
1201 // y/w shuffle mask
1202 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1203 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1204
1205
1206 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1207 // apply defaults
1208 for (uint32_t i = 0; i < 4; ++i)
1209 {
1210 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1211 }
1212
1213 for(uint32_t i = 0; i < info.numComps; i++)
1214 {
1215 uint32_t swizzleIndex = info.swizzle[i];
1216
1217 // select correct constMask for x/z or y/w pshufb
1218 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1219             // if x or y, use the first gather result, else use the second
1220 uint32_t selectedGather = (i < 2) ? 0 : 1;
1221
1222 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1223 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1224 // 256i - 0 1 2 3 4 5 6 7
1225 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1226 }
1227 }
1228 }
1229
1230 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1231 {
1232 // cast types
1233 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1234 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1235
1236 if(bPackedOutput)
1237 {
1238 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1239 // shuffle mask
1240 Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1241 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1242 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1243 // after pshufb: group components together in each 128bit lane
1244 // 256i - 0 1 2 3 4 5 6 7
1245 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1246
1247 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1248 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1249 // 256i - 0 1 2 3 4 5 6 7
1250 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1251
1252 // do the same for zw components
1253 Value* vi128ZW = nullptr;
1254 if(info.numComps > 2)
1255 {
1256 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1257 }
1258
1259         // extract the packed 128bit lanes for each enabled component; disabled components get their default value
1260 for(uint32_t i = 0; i < 4; i++)
1261 {
1262 uint32_t swizzleIndex = info.swizzle[i];
1263 // todo: fix for packed
1264 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1265 if(i >= info.numComps)
1266 {
1267 // set the default component val
1268 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1269 continue;
1270 }
1271
1272 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1273 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1274 // if x or y, use vi128XY permute result, else use vi128ZW
1275 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1276
1277 // sign extend
1278 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1279 }
1280 }
1281 // else zero extend
1282 else{
1283 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1284 // apply defaults
1285 for (uint32_t i = 0; i < 4; ++i)
1286 {
1287 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1288 }
1289
1290 for(uint32_t i = 0; i < info.numComps; i++){
1291 uint32_t swizzleIndex = info.swizzle[i];
1292
1293 // pshufb masks for each component
1294 Value* vConstMask;
1295 switch(i)
1296 {
1297 case 0:
1298 // x shuffle mask
1299 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1300 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1301 break;
1302 case 1:
1303 // y shuffle mask
1304 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1305 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1306 break;
1307 case 2:
1308 // z shuffle mask
1309 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1310 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1311 break;
1312 case 3:
1313 // w shuffle mask
1314 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1315 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1316 break;
1317 default:
1318 vConstMask = nullptr;
1319 break;
1320 }
1321
1322 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1323 // after pshufb for x channel
1324 // 256i - 0 1 2 3 4 5 6 7
1325 // x000 x000 x000 x000 x000 x000 x000 x000
1326 }
1327 }
1328 }
1329
1330 //////////////////////////////////////////////////////////////////////////
1331 /// @brief emulates a scatter operation.
1332 /// @param pDst - pointer to destination
1333 /// @param vSrc - vector of src data to scatter
1334 /// @param vOffsets - vector of byte offsets from pDst
1335 /// @param vMask - mask of valid lanes
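/// Usage sketch (illustrative; the names are hypothetical):
///     SCATTERPS(pDstBase, vValues, vByteOffsets, vLaneMask);
/// each active lane stores vValues[lane] to pDstBase + vByteOffsets[lane];
/// masked-off lanes write to a throwaway stack slot instead.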
1336 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1337 {
1338 Value* pStack = STACKSAVE();
1339
1340 Type* pSrcTy = vSrc->getType()->getVectorElementType();
1341
1342 // allocate tmp stack for masked off lanes
1343 Value* vTmpPtr = ALLOCA(pSrcTy);
1344
1345 Value *mask = MASK(vMask);
1346 for (uint32_t i = 0; i < mVWidth; ++i)
1347 {
1348 Value *offset = VEXTRACT(vOffsets, C(i));
1349 // byte pointer to component
1350 Value *storeAddress = GEP(pDst, offset);
1351 storeAddress = BITCAST(storeAddress, PointerType::get(pSrcTy, 0));
1352 Value *selMask = VEXTRACT(mask, C(i));
1353 Value *srcElem = VEXTRACT(vSrc, C(i));
1354         // switch in a safe address to store to if this lane is masked off
1355 Value *validAddress = SELECT(selMask, storeAddress, vTmpPtr);
1356 STORE(srcElem, validAddress);
1357 }
1358
1359 STACKRESTORE(pStack);
1360 }
1361
1362 Value* Builder::VABSPS(Value* a)
1363 {
1364 Value* asInt = BITCAST(a, mSimdInt32Ty);
1365 Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1366 return result;
1367 }
1368
1369 Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
1370 {
1371 Value *lowCmp = ICMP_SLT(src, low);
1372 Value *ret = SELECT(lowCmp, low, src);
1373
1374 Value *highCmp = ICMP_SGT(ret, high);
1375 ret = SELECT(highCmp, high, ret);
1376
1377 return ret;
1378 }
1379
1380 Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1381 {
1382 Value *lowCmp = FCMP_OLT(src, low);
1383 Value *ret = SELECT(lowCmp, low, src);
1384
1385 Value *highCmp = FCMP_OGT(ret, high);
1386 ret = SELECT(highCmp, high, ret);
1387
1388 return ret;
1389 }
1390
1391 Value *Builder::FCLAMP(Value* src, float low, float high)
1392 {
1393 Value* result = VMAXPS(src, VIMMED1(low));
1394 result = VMINPS(result, VIMMED1(high));
1395
1396 return result;
1397 }
1398
1399 //////////////////////////////////////////////////////////////////////////
1400 /// @brief save/restore stack, providing ability to push/pop the stack and
1401 /// reduce overall stack requirements for temporary stack use
1402 Value* Builder::STACKSAVE()
1403 {
1404 Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1405 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
1406 return CALL(pfnStackSave);
1407 #else
1408 return CALLA(pfnStackSave);
1409 #endif
1410 }
1411
1412 void Builder::STACKRESTORE(Value* pSaved)
1413 {
1414 Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1415 CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1416 }
1417
1418 Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1419 {
1420 Value* vOut;
1421 // use FMADs if available
1422 if(JM()->mArch.AVX2())
1423 {
1424 vOut = VFMADDPS(a, b, c);
1425 }
1426 else
1427 {
1428 vOut = FADD(FMUL(a, b), c);
1429 }
1430 return vOut;
1431 }
1432
1433 Value* Builder::POPCNT(Value* a)
1434 {
1435 Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1436 return CALL(pCtPop, std::initializer_list<Value*>{a});
1437 }
1438
1439 //////////////////////////////////////////////////////////////////////////
1440 /// @brief C functions called by LLVM IR
1441 //////////////////////////////////////////////////////////////////////////
1442
1443 //////////////////////////////////////////////////////////////////////////
1444 /// @brief called in JIT code, inserted by PRINT
1445 /// output to both stdout and visual studio debug console
1446 void __cdecl CallPrint(const char* fmt, ...)
1447 {
1448 va_list args;
1449 va_start(args, fmt);
1450 vprintf(fmt, args);
1451
1452 #if defined( _WIN32 )
1453 char strBuf[1024];
1454 vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1455 OutputDebugString(strBuf);
1456 #endif
1457 }
1458
1459 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1460 {
1461 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
1462 Function *func =
1463 Intrinsic::getDeclaration(JM()->mpCurrentModule,
1464 Intrinsic::x86_avx_vextractf128_si_256);
1465 return CALL(func, {a, imm8});
1466 #else
1467 bool flag = !imm8->isZeroValue();
1468 SmallVector<Constant*,8> idx;
1469 for (unsigned i = 0; i < mVWidth / 2; i++) {
1470 idx.push_back(C(flag ? i + mVWidth / 2 : i));
1471 }
1472 return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1473 #endif
1474 }
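// Illustrative note: for mVWidth == 8 the shuffle above selects elements
// {0,1,2,3} when imm8 is zero and {4,5,6,7} otherwise, i.e. the low or the
// high 128 bits of the 256bit source.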
1475
1476 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1477 {
1478 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
1479 Function *func =
1480 Intrinsic::getDeclaration(JM()->mpCurrentModule,
1481 Intrinsic::x86_avx_vinsertf128_si_256);
1482 return CALL(func, {a, b, imm8});
1483 #else
1484 bool flag = !imm8->isZeroValue();
1485 SmallVector<Constant*,8> idx;
1486 for (unsigned i = 0; i < mVWidth; i++) {
1487 idx.push_back(C(i));
1488 }
1489 Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1490
1491 SmallVector<Constant*,8> idx2;
1492 for (unsigned i = 0; i < mVWidth / 2; i++) {
1493 idx2.push_back(C(flag ? i : i + mVWidth));
1494 }
1495 for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1496 idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1497 }
1498 return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1499 #endif
1500 }
1501
1502 // rdtsc buckets macros
1503 void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1504 {
1505 std::vector<Type*> args{
1506 PointerType::get(mInt32Ty, 0), // pBucketMgr
1507 mInt32Ty // id
1508 };
1509
1510 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1511 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1512 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1513 {
1514 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1515 }
1516
1517 CALL(pFunc, { pBucketMgr, pId });
1518 }
1519
1520 void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1521 {
1522 std::vector<Type*> args{
1523 PointerType::get(mInt32Ty, 0), // pBucketMgr
1524 mInt32Ty // id
1525 };
1526
1527 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1528 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1529 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1530 {
1531 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1532 }
1533
1534 CALL(pFunc, { pBucketMgr, pId });
1535 }
1536