swr: [rasterizer jitter] Fix type mismatch on select args for SCATTERPS
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "llvm/Support/DynamicLibrary.h"
32
33 void __cdecl CallPrint(const char* fmt, ...);
34
35 //////////////////////////////////////////////////////////////////////////
36 /// @brief Convert an IEEE 754 32-bit single precision float to an
37 /// 16 bit float with 5 exponent bits and a variable
38 /// number of mantissa bits.
39 /// @param val - 32-bit float
40 /// @todo Maybe move this outside of this file into a header?
41 static uint16_t Convert32To16Float(float val)
42 {
43 uint32_t sign, exp, mant;
44 uint32_t roundBits;
45
46 // Extract the sign, exponent, and mantissa
47 uint32_t uf = *(uint32_t*)&val;
48 sign = (uf & 0x80000000) >> 31;
49 exp = (uf & 0x7F800000) >> 23;
50 mant = uf & 0x007FFFFF;
51
52 // Check for out of range
53 if (std::isnan(val))
54 {
55 exp = 0x1F;
56 mant = 0x200;
57 sign = 1; // set the sign bit for NANs
58 }
59 else if (std::isinf(val))
60 {
61 exp = 0x1f;
62 mant = 0x0;
63 }
64 else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
65 {
66 exp = 0x1E;
67 mant = 0x3FF;
68 }
69 else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
70 {
71 mant |= 0x00800000;
72 for (; exp <= 0x70; mant >>= 1, exp++)
73 ;
74 exp = 0;
75 mant = mant >> 13;
76 }
77 else if (exp < 0x66) // Too small to represent -> Zero
78 {
79 exp = 0;
80 mant = 0;
81 }
82 else
83 {
84 // Saves bits that will be shifted off for rounding
85 roundBits = mant & 0x1FFFu;
86 // convert exponent and mantissa to 16 bit format
87 exp = exp - 0x70;
88 mant = mant >> 13;
89
90 // Essentially RTZ, but round up if off by only 1 lsb
91 if (roundBits == 0x1FFFu)
92 {
93 mant++;
94 // check for overflow
95 if ((mant & 0xC00u) != 0)
96 exp++;
97 // make sure only the needed bits are used
98 mant &= 0x3FF;
99 }
100 }
101
102 uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
103 return (uint16_t)tmpVal;
104 }
105
106 //////////////////////////////////////////////////////////////////////////
107 /// @brief Convert an IEEE 754 16-bit float to an 32-bit single precision
108 /// float
109 /// @param val - 16-bit float
110 /// @todo Maybe move this outside of this file into a header?
111 static float ConvertSmallFloatTo32(UINT val)
112 {
113 UINT result;
114 if ((val & 0x7fff) == 0)
115 {
116 result = ((uint32_t)(val & 0x8000)) << 16;
117 }
118 else if ((val & 0x7c00) == 0x7c00)
119 {
120 result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
121 result |= ((uint32_t)val & 0x8000) << 16;
122 }
123 else
124 {
125 uint32_t sign = (val & 0x8000) << 16;
126 uint32_t mant = (val & 0x3ff) << 13;
127 uint32_t exp = (val >> 10) & 0x1f;
128 if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
129 {
130 mant <<= 1;
131 while (mant < (0x400 << 13))
132 {
133 exp--;
134 mant <<= 1;
135 }
136 mant &= (0x3ff << 13);
137 }
138 exp = ((exp - 15 + 127) & 0xff) << 23;
139 result = sign | exp | mant;
140 }
141
142 return *(float*)&result;
143 }
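// Illustrative spot checks (not part of the original file) for the two scalar
// conversion helpers above; the expected bit patterns are the standard IEEE
// half-precision encodings of 1.0 and -2.0.
//
//   assert(Convert32To16Float(1.0f)  == 0x3C00);
//   assert(Convert32To16Float(-2.0f) == 0xC000);
//   assert(ConvertSmallFloatTo32(0x3C00) == 1.0f);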
144
145 Constant *Builder::C(bool i)
146 {
147 return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
148 }
149
150 Constant *Builder::C(char i)
151 {
152 return ConstantInt::get(IRB()->getInt8Ty(), i);
153 }
154
155 Constant *Builder::C(uint8_t i)
156 {
157 return ConstantInt::get(IRB()->getInt8Ty(), i);
158 }
159
160 Constant *Builder::C(int i)
161 {
162 return ConstantInt::get(IRB()->getInt32Ty(), i);
163 }
164
165 Constant *Builder::C(int64_t i)
166 {
167 return ConstantInt::get(IRB()->getInt64Ty(), i);
168 }
169
170 Constant *Builder::C(uint16_t i)
171 {
172 return ConstantInt::get(mInt16Ty,i);
173 }
174
175 Constant *Builder::C(uint32_t i)
176 {
177 return ConstantInt::get(IRB()->getInt32Ty(), i);
178 }
179
180 Constant *Builder::C(float i)
181 {
182 return ConstantFP::get(IRB()->getFloatTy(), i);
183 }
184
185 Constant *Builder::PRED(bool pred)
186 {
187 return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
188 }
189
190 Value *Builder::VIMMED1(int i)
191 {
192 return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
193 }
194
195 Value *Builder::VIMMED1(uint32_t i)
196 {
197 return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
198 }
199
200 Value *Builder::VIMMED1(float i)
201 {
202 return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantFP>(C(i)));
203 }
204
205 Value *Builder::VIMMED1(bool i)
206 {
207 return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
208 }
209
210 Value *Builder::VUNDEF_IPTR()
211 {
212 return UndefValue::get(VectorType::get(PointerType::get(mInt32Ty, 0),JM()->mVWidth));
213 }
214
215 Value *Builder::VUNDEF_I()
216 {
217 return UndefValue::get(VectorType::get(mInt32Ty, JM()->mVWidth));
218 }
219
220 Value *Builder::VUNDEF(Type *ty, uint32_t size)
221 {
222 return UndefValue::get(VectorType::get(ty, size));
223 }
224
225 Value *Builder::VUNDEF_F()
226 {
227 return UndefValue::get(VectorType::get(mFP32Ty, JM()->mVWidth));
228 }
229
230 Value *Builder::VUNDEF(Type* t)
231 {
232 return UndefValue::get(VectorType::get(t, JM()->mVWidth));
233 }
234
235 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
236 Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
237 {
238 return VINSERT(vec, val, C((int64_t)index));
239 }
240 #endif
241
242 Value *Builder::VBROADCAST(Value *src)
243 {
244 // check if src is already a vector
245 if (src->getType()->isVectorTy())
246 {
247 return src;
248 }
249
250 return VECTOR_SPLAT(JM()->mVWidth, src);
251 }
252
253 uint32_t Builder::IMMED(Value* v)
254 {
255 SWR_ASSERT(isa<ConstantInt>(v));
256 ConstantInt *pValConst = cast<ConstantInt>(v);
257 return pValConst->getZExtValue();
258 }
259
260 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
261 {
262 std::vector<Value*> indices;
263 for (auto i : indexList)
264 indices.push_back(i);
265 return GEPA(ptr, indices);
266 }
267
268 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
269 {
270 std::vector<Value*> indices;
271 for (auto i : indexList)
272 indices.push_back(C(i));
273 return GEPA(ptr, indices);
274 }
275
276 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
277 {
278 std::vector<Value*> valIndices;
279 for (auto i : indices)
280 valIndices.push_back(C(i));
281 return LOAD(GEPA(basePtr, valIndices), name);
282 }
283
284 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
285 {
286 std::vector<Value*> valIndices;
287 for (auto i : indices)
288 valIndices.push_back(i);
289 return LOAD(GEPA(basePtr, valIndices), name);
290 }
291
292 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
293 {
294 std::vector<Value*> valIndices;
295 for (auto i : indices)
296 valIndices.push_back(C(i));
297 return STORE(val, GEPA(basePtr, valIndices));
298 }
299
300 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
301 {
302 std::vector<Value*> valIndices;
303 for (auto i : indices)
304 valIndices.push_back(i);
305 return STORE(val, GEPA(basePtr, valIndices));
306 }
307
308 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
309 {
310 std::vector<Value*> args;
311 for (auto arg : argsList)
312 args.push_back(arg);
313 return CALLA(Callee, args);
314 }
315
316 Value *Builder::VRCP(Value *va)
317 {
318 return FDIV(VIMMED1(1.0f), va); // 1 / a
319 }
320
321 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
322 {
323 Value* vOut = FMADDPS(vA, vX, vC);
324 vOut = FMADDPS(vB, vY, vOut);
325 return vOut;
326 }
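// Example usage (illustrative): evaluating a plane equation a*x + b*y + c
// across a SIMD of positions, e.g. when interpolating an attribute. vA, vB,
// vC, vX and vY here are placeholder SIMD float values.
//
//   Value* vInterp = VPLANEPS(vA, vB, vC, vX, vY);   // vA*vX + vB*vY + vC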
327
328 //////////////////////////////////////////////////////////////////////////
329 /// @brief Generate an i32 masked load operation in LLVM IR. If not
330 /// supported on the underlying platform, emulate it with float masked load
331 /// @param src - base address pointer for the load
332 /// @param vMask - SIMD wide mask that controls whether to access memory or load 0
333 Value *Builder::MASKLOADD(Value* src,Value* mask)
334 {
335 Value* vResult;
336 // use avx2 masked load instruction if available
337 if(JM()->mArch.AVX2())
338 {
339 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
340 vResult = CALL(func,{src,mask});
341 }
342 else
343 {
344 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
345 Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,JM()->mVWidth));
346 vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,JM()->mVWidth));
347 }
348 return vResult;
349 }
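// Illustrative call (assumption; pAddr and vMask are placeholder values):
// load 8 dwords from pAddr, returning 0 for lanes whose mask element does not
// have its sign bit set.
//
//   Value* vData = MASKLOADD(pAddr, vMask);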
350
351 //////////////////////////////////////////////////////////////////////////
352 /// @brief insert a JIT call to CallPrint
353 /// - outputs formatted string to both stdout and VS output window
354 /// - DEBUG builds only
355 /// Usage example:
356 /// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
357 /// where C(lane) creates a constant value to print, and pIndex is the Value*
358 /// result from a GEP, printing out the pointer to memory
359 /// @param printStr - constant string to print, which includes format specifiers
360 /// @param printArgs - initializer list of Value*'s to print to std out
361 CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
362 {
363 // push the arguments to CallPrint into a vector
364 std::vector<Value*> printCallArgs;
365 // save room for the format string. we still need to modify it for vectors
366 printCallArgs.resize(1);
367
368 // search through the format string for special processing
369 size_t pos = 0;
370 std::string tempStr(printStr);
371 pos = tempStr.find('%', pos);
372 auto v = printArgs.begin();
373
374 while ((pos != std::string::npos) && (v != printArgs.end()))
375 {
376 Value* pArg = *v;
377 Type* pType = pArg->getType();
378
379 if (tempStr[pos + 1] == 't')
380 {
381 if (pType->isVectorTy())
382 {
383 Type* pContainedType = pType->getContainedType(0);
384
385 std::string vectorFormatStr;
386
387 if (pContainedType->isFloatTy())
388 {
389 tempStr[pos + 1] = 'f'; // Ensure it's %f
390 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(0)), mDoubleTy));
391
392 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
393 {
394 vectorFormatStr += "%f ";
395 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), mDoubleTy));
396 }
397 }
398 else if (pContainedType->isIntegerTy())
399 {
400 tempStr[pos + 1] = 'd'; // Ensure it's %d
401 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
402
403 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
404 {
405 vectorFormatStr += "%d ";
406 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
407 }
408 }
409 else
410 {
411 SWR_ASSERT(0, "Unsupported tyep");
412 }
413
414 tempStr.insert(pos, vectorFormatStr);
415 pos += vectorFormatStr.size();
416 }
417 else
418 {
419 if (pType->isFloatTy())
420 {
421 tempStr[pos + 1] = 'f'; // Ensure it's %f
422 printCallArgs.push_back(FP_EXT(pArg, mDoubleTy));
423 }
424 else if (pType->isIntegerTy())
425 {
426 tempStr[pos + 1] = 'd'; // Ensure it's %d
427 printCallArgs.push_back(pArg);
428 }
429 }
430 }
431 else if (toupper(tempStr[pos + 1]) == 'X')
432 {
433 if (pType->isVectorTy())
434 {
435 tempStr[pos] = '0';
436 tempStr.insert(pos + 1, "x%08");
437
438 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
439
440 std::string vectorFormatStr;
441 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
442 {
443 vectorFormatStr += "0x%08X ";
444 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
445 }
446
447 tempStr.insert(pos, vectorFormatStr);
448 pos += vectorFormatStr.size();
449 }
450 else
451 {
452 tempStr[pos] = '0';
453 tempStr.insert(pos + 1, "x%08");
454 printCallArgs.push_back(pArg);
455 pos += 3;
456 }
457 }
458 // for %f we need to cast float Values to doubles so that they print out correctly
459 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
460 {
461 printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
462 pos++;
463 }
464 // add special handling for %f and %d format specifiers to make printing llvm vector types easier
465 else if (pType->isVectorTy())
466 {
467 Type* pContainedType = pType->getContainedType(0);
468
469 if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
470 {
471 uint32_t i = 0;
472 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
473 {
474 tempStr.insert(pos, std::string("%f "));
475 pos += 3;
476 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
477 }
478 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
479 }
480 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
481 {
482 uint32_t i = 0;
483 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
484 {
485 tempStr.insert(pos, std::string("%d "));
486 pos += 3;
487 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
488 }
489 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
490 }
491 else
492 {
493 /// not a supported vector to print
494 /// @todo pointer types too
495 SWR_ASSERT(0);
496 }
497 }
498 else
499 {
500 printCallArgs.push_back(pArg);
501 }
502
503 // advance to the next argument
504 v++;
505 pos = tempStr.find('%', ++pos);
506 }
507
508 // create global variable constant string
509 Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
510 GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
511 JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
512
513 // get a pointer to the first character in the constant string array
514 std::vector<Constant*> geplist{C(0),C(0)};
515 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
516 Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
517 #else
518 Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
519 #endif
520
521 // insert the pointer to the format string in the argument vector
522 printCallArgs[0] = strGEP;
523
524 // get pointer to CallPrint function and insert decl into the module if needed
525 std::vector<Type*> args;
526 args.push_back(PointerType::get(mInt8Ty,0));
527 FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
528 Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
529
530 // if we haven't yet added the symbol to the symbol table
531 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
532 {
533 sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
534 }
535
536 // insert a call to CallPrint
537 return CALLA(callPrintFn,printCallArgs);
538 }
539
540 //////////////////////////////////////////////////////////////////////////
541 /// @brief Wrapper around PRINT with initializer list.
542 CallInst* Builder::PRINT(const std::string &printStr)
543 {
544 return PRINT(printStr, {});
545 }
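// Additional usage sketch (assumption; vOffsets is a placeholder Value*): the
// %t specifier handled above expands a whole SIMD register into one %d/%f per
// lane.
//
//   PRINT("vOffsets = %t\n", {vOffsets});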
546
547 //////////////////////////////////////////////////////////////////////////
548 /// @brief Generate a masked gather operation in LLVM IR. If not
549 /// supported on the underlying platform, emulate it with loads
550 /// @param vSrc - SIMD wide value that will be used for lanes where the mask is not set
551 /// @param pBase - Int8* base VB address pointer value
552 /// @param vIndices - SIMD wide value of VB byte offsets
553 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
554 /// @param scale - value to scale indices by
555 Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
556 {
557 Value* vGather;
558
559 // use avx2 gather instruction if available
560 if(JM()->mArch.AVX2())
561 {
562 // force mask to <N x float>, required by vgather
563 vMask = BITCAST(vMask, mSimdFP32Ty);
564 vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
565 }
566 else
567 {
568 Value* pStack = STACKSAVE();
569
570 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
571 Value* vSrcPtr = ALLOCA(vSrc->getType());
572 STORE(vSrc, vSrcPtr);
573
574 vGather = VUNDEF_F();
575 Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
576 Value *vOffsets = MUL(vIndices,vScaleVec);
577 Value *mask = MASK(vMask);
578 for(uint32_t i = 0; i < JM()->mVWidth; ++i)
579 {
580 // single component byte index
581 Value *offset = VEXTRACT(vOffsets,C(i));
582 // byte pointer to component
583 Value *loadAddress = GEP(pBase,offset);
584 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
585 // pointer to the value to load if we're masking off a component
586 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
587 Value *selMask = VEXTRACT(mask,C(i));
588 // switch in a safe address to load from if this lane is masked off
589 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
590 Value *val = LOAD(validAddress);
591 vGather = VINSERT(vGather,val,C(i));
592 }
593 STACKRESTORE(pStack);
594 }
595
596 return vGather;
597 }
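// Illustrative call (assumption, mirroring GATHER4PS below; vOld, pSrcBase,
// byteOffsets and vMask are placeholders): gather one float per lane from
// pSrcBase + byteOffsets, keeping the lanes of vOld where the mask is not
// set. The last argument is the index scale, 1 here since byteOffsets are
// already byte offsets.
//
//   Value* vResult = GATHERPS(vOld, pSrcBase, byteOffsets, vMask, C((char)1));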
598
599 //////////////////////////////////////////////////////////////////////////
600 /// @brief Generate a masked gather operation in LLVM IR. If not
601 /// supported on the underlying platform, emulate it with loads
602 /// @param vSrc - SIMD wide value that will be used for lanes where the mask is not set
603 /// @param pBase - Int8* base VB address pointer value
604 /// @param vIndices - SIMD wide value of VB byte offsets
605 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
606 /// @param scale - value to scale indices by
607 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
608 {
609 Value* vGather;
610
611 // use avx2 gather instruction if available
612 if(JM()->mArch.AVX2())
613 {
614 vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
615 }
616 else
617 {
618 Value* pStack = STACKSAVE();
619
620 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
621 Value* vSrcPtr = ALLOCA(vSrc->getType());
622 STORE(vSrc, vSrcPtr);
623
624 vGather = VUNDEF_I();
625 Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
626 Value *vOffsets = MUL(vIndices, vScaleVec);
627 Value *mask = MASK(vMask);
628 for(uint32_t i = 0; i < JM()->mVWidth; ++i)
629 {
630 // single component byte index
631 Value *offset = VEXTRACT(vOffsets, C(i));
632 // byte pointer to component
633 Value *loadAddress = GEP(pBase, offset);
634 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
635 // pointer to the value to load if we're masking off a component
636 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
637 Value *selMask = VEXTRACT(mask, C(i));
638 // switch in a safe address to load from if this lane is masked off
639 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
640 Value *val = LOAD(validAddress, C(0));
641 vGather = VINSERT(vGather, val, C(i));
642 }
643
644 STACKRESTORE(pStack);
645 }
646 return vGather;
647 }
648
649 //////////////////////////////////////////////////////////////////////////
650 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
651 Value* Builder::MASK(Value* vmask)
652 {
653 Value* src = BITCAST(vmask, mSimdInt32Ty);
654 return ICMP_SLT(src, VIMMED1(0));
655 }
656
657 //////////////////////////////////////////////////////////////////////////
658 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
659 Value* Builder::VMASK(Value* mask)
660 {
661 return S_EXT(mask, mSimdInt32Ty);
662 }
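// Note (illustrative): MASK() and VMASK() are inverses in the sense that the
// x86 representation keeps the sign bit of each lane set for active lanes.
//
//   Value* vI1  = MASK(vXmmMask);   // <N x i32>/<N x float> -> <N x i1>
//   Value* vXmm = VMASK(vI1);       // <N x i1> -> <N x i32> (0 or -1)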
663
664 //////////////////////////////////////////////////////////////////////////
665 /// @brief Generate a VPSHUFB operation in LLVM IR. If not
666 /// supported on the underlying platform, emulate it
667 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
668 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
669 /// Byte masks in the lower 128-bit lane of b select 8 bit values from the
670 /// lower 128 bits of a, and likewise for the upper lane. If a mask
671 /// value is negative, '0' is inserted.
672 Value *Builder::PSHUFB(Value* a, Value* b)
673 {
674 Value* res;
675 // use avx2 pshufb instruction if available
676 if(JM()->mArch.AVX2())
677 {
678 res = VPSHUFB(a, b);
679 }
680 else
681 {
682 Constant* cB = dyn_cast<Constant>(b);
683 // number of 8 bit elements in b
684 uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
685 // output vector
686 Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
687
688 // insert an 8 bit value from the high and low lanes of a per loop iteration
689 numElms /= 2;
690 for(uint32_t i = 0; i < numElms; i++)
691 {
692 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
693 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
694
695 // extract values from constant mask
696 char valLow128bLane = (char)(cLow128b->getSExtValue());
697 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
698
699 Value* insertValLow128b;
700 Value* insertValHigh128b;
701
702 // if the mask value is negative, insert a '0' in the respective output position
703 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
704 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
705 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
706
707 vShuf = VINSERT(vShuf, insertValLow128b, i);
708 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
709 }
710 res = vShuf;
711 }
712 return res;
713 }
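// Illustrative behaviour (restating the emulation above): with a constant mask
// b, byte i of each 128-bit output lane is a[b[i] & 0xF] taken from the same
// lane of a, or 0 when b[i] is negative; e.g. an all-zero mask splats byte 0
// of each lane, and an all -1 mask produces an all-zero result.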
714
715 //////////////////////////////////////////////////////////////////////////
716 /// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32
717 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
718 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
719 /// lower 8 values are used.
720 Value *Builder::PMOVSXBD(Value* a)
721 {
722 Value* res;
723 // use avx2 byte sign extend instruction if available
724 if(JM()->mArch.AVX2())
725 {
726 res = VPMOVSXBD(a);
727 }
728 else
729 {
730 // VPMOVSXBD output type
731 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
732 // Extract 8 values from 128bit lane and sign extend
733 res = S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
734 }
735 return res;
736 }
737
738 //////////////////////////////////////////////////////////////////////////
739 /// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32
740 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
741 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
742 Value *Builder::PMOVSXWD(Value* a)
743 {
744 Value* res;
745 // use avx2 word sign extend if available
746 if(JM()->mArch.AVX2())
747 {
748 res = VPMOVSXWD(a);
749 }
750 else
751 {
752 // VPMOVSXWD output type
753 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
754 // Extract 8 values from 128bit lane and sign extend
755 res = S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
756 }
757 return res;
758 }
759
760 //////////////////////////////////////////////////////////////////////////
761 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
762 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
763 /// platform, emulate it
764 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
765 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
766 Value *Builder::PERMD(Value* a, Value* idx)
767 {
768 Value* res;
769 // use avx2 permute instruction if available
770 if(JM()->mArch.AVX2())
771 {
772 // llvm 3.6.0 swapped the order of the args to vpermd
773 res = VPERMD(idx, a);
774 }
775 else
776 {
777 res = VSHUFFLE(a, a, idx);
778 }
779 return res;
780 }
781
782 //////////////////////////////////////////////////////////////////////////
783 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
784 /// in LLVM IR. If not supported on the underlying platform, emulate it
785 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
786 Value *Builder::CVTPH2PS(Value* a)
787 {
788 if (JM()->mArch.F16C())
789 {
790 return VCVTPH2PS(a);
791 }
792 else
793 {
794 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
795 Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));
796
797 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
798 {
799 sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
800 }
801
802 Value* pResult = UndefValue::get(mSimdFP32Ty);
803 for (uint32_t i = 0; i < JM()->mVWidth; ++i)
804 {
805 Value* pSrc = VEXTRACT(a, C(i));
806 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
807 pResult = VINSERT(pResult, pConv, C(i));
808 }
809
810 return pResult;
811 }
812 }
813
814 //////////////////////////////////////////////////////////////////////////
815 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
816 /// in LLVM IR. If not supported on the underlying platform, emulate it
817 /// @param a - 256bit SIMD lane(8x32bit) of float32 values to convert.
818 Value *Builder::CVTPS2PH(Value* a, Value* rounding)
819 {
820 if (JM()->mArch.F16C())
821 {
822 return VCVTPS2PH(a, rounding);
823 }
824 else
825 {
826 // call scalar C function for now
827 FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
828 Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));
829
830 if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
831 {
832 sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
833 }
834
835 Value* pResult = UndefValue::get(mSimdInt16Ty);
836 for (uint32_t i = 0; i < JM()->mVWidth; ++i)
837 {
838 Value* pSrc = VEXTRACT(a, C(i));
839 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
840 pResult = VINSERT(pResult, pConv, C(i));
841 }
842
843 return pResult;
844 }
845 }
846
847 Value *Builder::PMAXSD(Value* a, Value* b)
848 {
849 if (JM()->mArch.AVX2())
850 {
851 return VPMAXSD(a, b);
852 }
853 else
854 {
855 // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
856 Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);
857
858 // low 128
859 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
860 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
861 Value* resLo = CALL(pmaxsd, {aLo, bLo});
862
863 // high 128
864 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
865 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
866 Value* resHi = CALL(pmaxsd, {aHi, bHi});
867
868 // combine
869 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
870 result = VINSERTI128(result, resHi, C((uint8_t)1));
871
872 return result;
873 }
874 }
875
876 Value *Builder::PMINSD(Value* a, Value* b)
877 {
878 if (JM()->mArch.AVX2())
879 {
880 return VPMINSD(a, b);
881 }
882 else
883 {
884 // use 4-wide sse min intrinsic on lower/upper halves of 8-wide sources
885 Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);
886
887 // low 128
888 Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
889 Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
890 Value* resLo = CALL(pminsd, {aLo, bLo});
891
892 // high 128
893 Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
894 Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
895 Value* resHi = CALL(pminsd, {aHi, bHi});
896
897 // combine
898 Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
899 result = VINSERTI128(result, resHi, C((uint8_t)1));
900
901 return result;
902 }
903 }
904
905 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
906 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
907 {
908 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
909 if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
910 {
911 // ensure our mask is the correct type
912 mask = BITCAST(mask, mSimdFP32Ty);
913 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
914 }
915 else
916 {
917 // ensure our mask is the correct type
918 mask = BITCAST(mask, mSimdInt32Ty);
919 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
920 }
921 }
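// Dispatch sketch (illustrative): 32-bit float formats such as
// R32G32B32A32_FLOAT take the GATHER4PS path above, while integer and
// normalized formats such as R8G8B8A8_UNORM take the GATHER4DD path and are
// shuffled into place afterwards.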
922
923 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
924 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
925 {
926 switch(info.bpp / info.numComps)
927 {
928 case 16:
929 {
930 Value* vGatherResult[2];
931 Value *vMask;
932
933 // TODO: vGatherMaskedVal
934 Value* vGatherMaskedVal = VIMMED1((float)0);
935
936 // always have at least one component out of x or y to fetch
937
938 // save mask as it is zero'd out after each gather
939 vMask = mask;
940
941 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
942 // e.g. result of first 8x32bit integer gather for 16bit components
943 // 256i - 0 1 2 3 4 5 6 7
944 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
945 //
946
947 // if we have at least one component out of z or w to fetch
948 if(info.numComps > 2)
949 {
950 // offset base to the next components(zw) in the vertex to gather
951 pSrcBase = GEP(pSrcBase, C((char)4));
952 vMask = mask;
953
954 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
955 // e.g. result of second 8x32bit integer gather for 16bit components
956 // 256i - 0 1 2 3 4 5 6 7
957 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
958 //
959 }
960 else
961 {
962 vGatherResult[1] = vGatherMaskedVal;
963 }
964
965 // Shuffle gathered components into place, each row is a component
966 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
967 }
968 break;
969 case 32:
970 {
971 // apply defaults
972 for (uint32_t i = 0; i < 4; ++i)
973 {
974 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
975 }
976
977 for(uint32_t i = 0; i < info.numComps; i++)
978 {
979 uint32_t swizzleIndex = info.swizzle[i];
980
981 // save mask as it is zero'd out after each gather
982 Value *vMask = mask;
983
984 // Gather a SIMD of components
985 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
986
987 // offset base to the next component to gather
988 pSrcBase = GEP(pSrcBase, C((char)4));
989 }
990 }
991 break;
992 default:
993 SWR_ASSERT(0, "Invalid float format");
994 break;
995 }
996 }
997
998 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
999 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1000 {
1001 switch (info.bpp / info.numComps)
1002 {
1003 case 8:
1004 {
1005 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1006 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
1007 // e.g. result of an 8x32bit integer gather for 8bit components
1008 // 256i - 0 1 2 3 4 5 6 7
1009 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1010
1011 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1012 }
1013 break;
1014 case 16:
1015 {
1016 Value* vGatherResult[2];
1017 Value *vMask;
1018
1019 // TODO: vGatherMaskedVal
1020 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1021
1022 // always have at least one component out of x or y to fetch
1023
1024 // save mask as it is zero'd out after each gather
1025 vMask = mask;
1026
1027 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1028 // e.g. result of first 8x32bit integer gather for 16bit components
1029 // 256i - 0 1 2 3 4 5 6 7
1030 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1031 //
1032
1033 // if we have at least one component out of z or w to fetch
1034 if(info.numComps > 2)
1035 {
1036 // offset base to the next components(zw) in the vertex to gather
1037 pSrcBase = GEP(pSrcBase, C((char)4));
1038 vMask = mask;
1039
1040 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
1041 // e.g. result of second 8x32bit integer gather for 16bit components
1042 // 256i - 0 1 2 3 4 5 6 7
1043 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1044 //
1045 }
1046 else
1047 {
1048 vGatherResult[1] = vGatherMaskedVal;
1049 }
1050
1051 // Shuffle gathered components into place, each row is a component
1052 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1053
1054 }
1055 break;
1056 case 32:
1057 {
1058 // apply defaults
1059 for (uint32_t i = 0; i < 4; ++i)
1060 {
1061 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1062 }
1063
1064 for(uint32_t i = 0; i < info.numComps; i++)
1065 {
1066 uint32_t swizzleIndex = info.swizzle[i];
1067
1068 // save mask as it is zero'd out after each gather
1069 Value *vMask = mask;
1070
1071 // Gather a SIMD of components
1072 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
1073
1074 // offset base to the next component to gather
1075 pSrcBase = GEP(pSrcBase, C((char)4));
1076 }
1077 }
1078 break;
1079 default:
1080 SWR_ASSERT(0, "unsupported format");
1081 break;
1082 }
1083 }
1084
1085 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1086 {
1087 // cast types
1088 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
1089 Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits
1090
1091 // input could either be float or int vector; do shuffle work in int
1092 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1093 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1094
1095 if(bPackedOutput)
1096 {
1097 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
1098
1099 // shuffle mask
1100 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1101 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1102 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1103 // after pshufb: group components together in each 128bit lane
1104 // 256i - 0 1 2 3 4 5 6 7
1105 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1106
1107 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1108 // after PERMD: move and pack xy components into each 128bit lane
1109 // 256i - 0 1 2 3 4 5 6 7
1110 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1111
1112 // do the same for zw components
1113 Value* vi128ZW = nullptr;
1114 if(info.numComps > 2)
1115 {
1116 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1117 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1118 }
1119
1120 for(uint32_t i = 0; i < 4; i++)
1121 {
1122 uint32_t swizzleIndex = info.swizzle[i];
1123 // todo: fix for packed
1124 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1125 if(i >= info.numComps)
1126 {
1127 // set the default component val
1128 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1129 continue;
1130 }
1131
1132 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1133 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1134 // if x or y, use vi128XY permute result, else use vi128ZW
1135 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1136
1137 // extract packed component 128 bit lanes
1138 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1139 }
1140
1141 }
1142 else
1143 {
1144 // pshufb masks for each component
1145 Value* vConstMask[2];
1146 // x/z shuffle mask
1147 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1148 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1149
1150 // y/w shuffle mask
1151 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1152 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1153
1154
1155 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1156 // apply defaults
1157 for (uint32_t i = 0; i < 4; ++i)
1158 {
1159 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1160 }
1161
1162 for(uint32_t i = 0; i < info.numComps; i++)
1163 {
1164 uint32_t swizzleIndex = info.swizzle[i];
1165
1166 // select correct constMask for x/z or y/w pshufb
1167 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1168 // if x or y, use vi128XY permute result, else use vi128ZW
1169 uint32_t selectedGather = (i < 2) ? 0 : 1;
1170
1171 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1172 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1173 // 256i - 0 1 2 3 4 5 6 7
1174 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1175 }
1176 }
1177 }
1178
1179 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1180 {
1181 // cast types
1182 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
1183 Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits
1184
1185 if(bPackedOutput)
1186 {
1187 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
1188 // shuffle mask
1189 Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1190 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1191 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1192 // after pshufb: group components together in each 128bit lane
1193 // 256i - 0 1 2 3 4 5 6 7
1194 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1195
1196 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1197 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1198 // 256i - 0 1 2 3 4 5 6 7
1199 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1200
1201 // do the same for zw components
1202 Value* vi128ZW = nullptr;
1203 if(info.numComps > 2)
1204 {
1205 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1206 }
1207
1208 // extract the packed components; any component beyond numComps gets its default value
1209 for(uint32_t i = 0; i < 4; i++)
1210 {
1211 uint32_t swizzleIndex = info.swizzle[i];
1212 // todo: fix for packed
1213 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1214 if(i >= info.numComps)
1215 {
1216 // set the default component val
1217 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1218 continue;
1219 }
1220
1221 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1222 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1223 // if x or y, use vi128XY permute result, else use vi128ZW
1224 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1225
1226 // extract packed component 128 bit lanes
1227 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1228 }
1229 }
1230 // else zero extend
1231 else{
1232 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1233 // apply defaults
1234 for (uint32_t i = 0; i < 4; ++i)
1235 {
1236 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1237 }
1238
1239 for(uint32_t i = 0; i < info.numComps; i++){
1240 uint32_t swizzleIndex = info.swizzle[i];
1241
1242 // pshufb masks for each component
1243 Value* vConstMask;
1244 switch(i)
1245 {
1246 case 0:
1247 // x shuffle mask
1248 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1249 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1250 break;
1251 case 1:
1252 // y shuffle mask
1253 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1254 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1255 break;
1256 case 2:
1257 // z shuffle mask
1258 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1259 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1260 break;
1261 case 3:
1262 // w shuffle mask
1263 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1264 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1265 break;
1266 default:
1267 vConstMask = nullptr;
1268 break;
1269 }
1270
1271 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1272 // after pshufb for x channel
1273 // 256i - 0 1 2 3 4 5 6 7
1274 // x000 x000 x000 x000 x000 x000 x000 x000
1275 }
1276 }
1277 }
1278
1279 //////////////////////////////////////////////////////////////////////////
1280 /// @brief emulates a scatter operation.
1281 /// @param pDst - pointer to destination
1282 /// @param vSrc - vector of src data to scatter
1283 /// @param vOffsets - vector of byte offsets from pDst
1284 /// @param vMask - mask of valid lanes
1285 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1286 {
1287 Value* pStack = STACKSAVE();
1288
1289 Type* pSrcTy = vSrc->getType()->getVectorElementType();
1290
1291 // allocate tmp stack for masked off lanes
1292 Value* vTmpPtr = ALLOCA(pSrcTy);
1293
1294 Value *mask = MASK(vMask);
1295 for (uint32_t i = 0; i < JM()->mVWidth; ++i)
1296 {
1297 Value *offset = VEXTRACT(vOffsets, C(i));
1298 // byte pointer to component
1299 Value *storeAddress = GEP(pDst, offset);
1300 storeAddress = BITCAST(storeAddress, PointerType::get(pSrcTy, 0));
1301 Value *selMask = VEXTRACT(mask, C(i));
1302 Value *srcElem = VEXTRACT(vSrc, C(i));
1303 // switch in a safe address to store to if this lane is masked off
1304 Value *validAddress = SELECT(selMask, storeAddress, vTmpPtr);
1305 STORE(srcElem, validAddress);
1306 }
1307
1308 STACKRESTORE(pStack);
1309 }
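// Illustrative call (assumption; pDst, vSrc, vOffsets and vMask are
// placeholders): scatter the lanes of vSrc to pDst + vOffsets bytes, with
// masked-off lanes written to a stack temporary instead of memory.
//
//   SCATTERPS(pDst, vSrc, vOffsets, vMask);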
1310
1311 Value* Builder::VABSPS(Value* a)
1312 {
1313 Value* asInt = BITCAST(a, mSimdInt32Ty);
1314 Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1315 return result;
1316 }
1317
1318 Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
1319 {
1320 Value *lowCmp = ICMP_SLT(src, low);
1321 Value *ret = SELECT(lowCmp, low, src);
1322
1323 Value *highCmp = ICMP_SGT(ret, high);
1324 ret = SELECT(highCmp, high, ret);
1325
1326 return ret;
1327 }
1328
1329 Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1330 {
1331 Value *lowCmp = FCMP_OLT(src, low);
1332 Value *ret = SELECT(lowCmp, low, src);
1333
1334 Value *highCmp = FCMP_OGT(ret, high);
1335 ret = SELECT(highCmp, high, ret);
1336
1337 return ret;
1338 }
1339
1340 Value *Builder::FCLAMP(Value* src, float low, float high)
1341 {
1342 Value* result = VMAXPS(src, VIMMED1(low));
1343 result = VMINPS(result, VIMMED1(high));
1344
1345 return result;
1346 }
1347
1348 //////////////////////////////////////////////////////////////////////////
1349 /// @brief save/restore stack, providing ability to push/pop the stack and
1350 /// reduce overall stack requirements for temporary stack use
1351 Value* Builder::STACKSAVE()
1352 {
1353 Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1354 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
1355 return CALL(pfnStackSave);
1356 #else
1357 return CALLA(pfnStackSave);
1358 #endif
1359 }
1360
1361 void Builder::STACKRESTORE(Value* pSaved)
1362 {
1363 Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1364 CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1365 }
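// Typical pattern (illustrative), as used by the gather/scatter emulation
// above; ty is a placeholder Type*.
//
//   Value* pStack = STACKSAVE();
//   Value* pTmp   = ALLOCA(ty);      // temporary live only in this block
//   /* ... use pTmp ... */
//   STACKRESTORE(pStack);            // reclaim the temporary's stack space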
1366
1367 Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1368 {
1369 Value* vOut;
1370 // use FMADs if available
1371 if(JM()->mArch.AVX2())
1372 {
1373 vOut = VFMADDPS(a, b, c);
1374 }
1375 else
1376 {
1377 vOut = FADD(FMUL(a, b), c);
1378 }
1379 return vOut;
1380 }
1381
1382 Value* Builder::POPCNT(Value* a)
1383 {
1384 Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1385 return CALL(pCtPop, std::initializer_list<Value*>{a});
1386 }
1387
1388 //////////////////////////////////////////////////////////////////////////
1389 /// @brief C functions called by LLVM IR
1390 //////////////////////////////////////////////////////////////////////////
1391
1392 //////////////////////////////////////////////////////////////////////////
1393 /// @brief called in JIT code, inserted by PRINT
1394 /// output to both stdout and visual studio debug console
1395 void __cdecl CallPrint(const char* fmt, ...)
1396 {
1397 va_list args;
1398 va_start(args, fmt);
1399 vprintf(fmt, args);
1400
1401 #if defined( _WIN32 )
1402 char strBuf[1024];
1403 vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1404 OutputDebugString(strBuf);
1405 #endif
1406 }
1407
1408 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1409 {
1410 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
1411 Function *func =
1412 Intrinsic::getDeclaration(JM()->mpCurrentModule,
1413 Intrinsic::x86_avx_vextractf128_si_256);
1414 return CALL(func, {a, imm8});
1415 #else
1416 bool flag = !imm8->isZeroValue();
1417 SmallVector<Constant*,8> idx;
1418 for (unsigned i = 0; i < JM()->mVWidth / 2; i++) {
1419 idx.push_back(C(flag ? i + JM()->mVWidth / 2 : i));
1420 }
1421 return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1422 #endif
1423 }
1424
1425 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1426 {
1427 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
1428 Function *func =
1429 Intrinsic::getDeclaration(JM()->mpCurrentModule,
1430 Intrinsic::x86_avx_vinsertf128_si_256);
1431 return CALL(func, {a, b, imm8});
1432 #else
1433 bool flag = !imm8->isZeroValue();
1434 SmallVector<Constant*,8> idx;
1435 for (unsigned i = 0; i < JM()->mVWidth; i++) {
1436 idx.push_back(C(i));
1437 }
1438 Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1439
1440 SmallVector<Constant*,8> idx2;
1441 for (unsigned i = 0; i < JM()->mVWidth / 2; i++) {
1442 idx2.push_back(C(flag ? i : i + JM()->mVWidth));
1443 }
1444 for (unsigned i = JM()->mVWidth / 2; i < JM()->mVWidth; i++) {
1445 idx2.push_back(C(flag ? i + JM()->mVWidth / 2 : i));
1446 }
1447 return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1448 #endif
1449 }