swr/rast: Simplify GATHER* jit builder api
src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "common/rdtsc_buckets.h"
32
33 #include <cstdarg>
34
35 namespace SwrJit
36 {
37 void __cdecl CallPrint(const char* fmt, ...);
38
39 //////////////////////////////////////////////////////////////////////////
40 /// @brief Convert an IEEE 754 32-bit single precision float to an
41 /// 16 bit float with 5 exponent bits and a variable
42 /// number of mantissa bits.
43 /// @param val - 32-bit float
44 /// @todo Maybe move this outside of this file into a header?
45 static uint16_t ConvertFloat32ToFloat16(float val)
46 {
47 uint32_t sign, exp, mant;
48 uint32_t roundBits;
49
50 // Extract the sign, exponent, and mantissa
51 uint32_t uf = *(uint32_t*)&val;
52 sign = (uf & 0x80000000) >> 31;
53 exp = (uf & 0x7F800000) >> 23;
54 mant = uf & 0x007FFFFF;
55
56 // Check for out of range
57 if (std::isnan(val))
58 {
59 exp = 0x1F;
60 mant = 0x200;
61 sign = 1; // set the sign bit for NANs
62 }
63 else if (std::isinf(val))
64 {
65 exp = 0x1f;
66 mant = 0x0;
67 }
68 else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
69 {
70 exp = 0x1E;
71 mant = 0x3FF;
72 }
73 else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
74 {
75 mant |= 0x00800000;
76 for (; exp <= 0x70; mant >>= 1, exp++)
77 ;
78 exp = 0;
79 mant = mant >> 13;
80 }
81 else if (exp < 0x66) // Too small to represent -> Zero
82 {
83 exp = 0;
84 mant = 0;
85 }
86 else
87 {
88 // Saves bits that will be shifted off for rounding
89 roundBits = mant & 0x1FFFu;
90 // convert exponent and mantissa to 16 bit format
91 exp = exp - 0x70;
92 mant = mant >> 13;
93
94 // Essentially RTZ, but round up if off by only 1 lsb
95 if (roundBits == 0x1FFFu)
96 {
97 mant++;
98 // check for overflow
99 if ((mant & 0xC00u) != 0)
100 exp++;
101 // make sure only the needed bits are used
102 mant &= 0x3FF;
103 }
104 }
105
106 uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
107 return (uint16_t)tmpVal;
108 }
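// A quick worked example of the conversion above (illustrative values only):
// converting 1.0f, whose IEEE-754 bits are 0x3F800000:
//   sign = 0, exp = 0x7F, mant = 0, roundBits = 0
//   exp falls in the normal range, so exp -> 0x7F - 0x70 = 0xF and mant -> 0
//   result = (0 << 15) | (0xF << 10) | 0 = 0x3C00, the half-precision encoding of 1.0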
109
110 //////////////////////////////////////////////////////////////////////////
111 /// @brief Convert an IEEE 754 16-bit float to an 32-bit single precision
112 /// float
113 /// @param val - 16-bit float
114 /// @todo Maybe move this outside of this file into a header?
115 static float ConvertFloat16ToFloat32(uint32_t val)
116 {
117 uint32_t result;
118 if ((val & 0x7fff) == 0)
119 {
120 result = ((uint32_t)(val & 0x8000)) << 16;
121 }
122 else if ((val & 0x7c00) == 0x7c00)
123 {
124 result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
125 result |= ((uint32_t)val & 0x8000) << 16;
126 }
127 else
128 {
129 uint32_t sign = (val & 0x8000) << 16;
130 uint32_t mant = (val & 0x3ff) << 13;
131 uint32_t exp = (val >> 10) & 0x1f;
132 if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
133 {
134 mant <<= 1;
135 while (mant < (0x400 << 13))
136 {
137 exp--;
138 mant <<= 1;
139 }
140 mant &= (0x3ff << 13);
141 }
142 exp = ((exp - 15 + 127) & 0xff) << 23;
143 result = sign | exp | mant;
144 }
145
146 return *(float*)&result;
147 }
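// The reverse direction on the same value (illustrative only): for 0x3C00,
// sign = 0, mant = 0 and exp = 0xF, so the rebuilt exponent is
// ((0xF - 15 + 127) & 0xff) << 23 = 0x3F800000, giving back exactly 1.0f.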
148
149 Constant *Builder::C(bool i)
150 {
151 return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
152 }
153
154 Constant *Builder::C(char i)
155 {
156 return ConstantInt::get(IRB()->getInt8Ty(), i);
157 }
158
159 Constant *Builder::C(uint8_t i)
160 {
161 return ConstantInt::get(IRB()->getInt8Ty(), i);
162 }
163
164 Constant *Builder::C(int i)
165 {
166 return ConstantInt::get(IRB()->getInt32Ty(), i);
167 }
168
169 Constant *Builder::C(int64_t i)
170 {
171 return ConstantInt::get(IRB()->getInt64Ty(), i);
172 }
173
174 Constant *Builder::C(uint16_t i)
175 {
176 return ConstantInt::get(mInt16Ty,i);
177 }
178
179 Constant *Builder::C(uint32_t i)
180 {
181 return ConstantInt::get(IRB()->getInt32Ty(), i);
182 }
183
184 Constant *Builder::C(float i)
185 {
186 return ConstantFP::get(IRB()->getFloatTy(), i);
187 }
188
189 Constant *Builder::PRED(bool pred)
190 {
191 return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
192 }
193
194 Value *Builder::VIMMED1(int i)
195 {
196 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
197 }
198
199 Value *Builder::VIMMED1(uint32_t i)
200 {
201 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
202 }
203
204 Value *Builder::VIMMED1(float i)
205 {
206 return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
207 }
208
209 Value *Builder::VIMMED1(bool i)
210 {
211 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
212 }
213
214 Value *Builder::VUNDEF_IPTR()
215 {
216 return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
217 }
218
219 Value *Builder::VUNDEF_I()
220 {
221 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
222 }
223
224 Value *Builder::VUNDEF(Type *ty, uint32_t size)
225 {
226 return UndefValue::get(VectorType::get(ty, size));
227 }
228
229 Value *Builder::VUNDEF_F()
230 {
231 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
232 }
233
234 #if USE_SIMD16_BUILDER
235 Value *Builder::VUNDEF2_F()
236 {
237 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth2));
238 }
239
240 #endif
241 Value *Builder::VUNDEF(Type* t)
242 {
243 return UndefValue::get(VectorType::get(t, mVWidth));
244 }
245
246 Value *Builder::VBROADCAST(Value *src)
247 {
248 // check if src is already a vector
249 if (src->getType()->isVectorTy())
250 {
251 return src;
252 }
253
254 return VECTOR_SPLAT(mVWidth, src);
255 }
256
257 uint32_t Builder::IMMED(Value* v)
258 {
259 SWR_ASSERT(isa<ConstantInt>(v));
260 ConstantInt *pValConst = cast<ConstantInt>(v);
261 return pValConst->getZExtValue();
262 }
263
264 int32_t Builder::S_IMMED(Value* v)
265 {
266 SWR_ASSERT(isa<ConstantInt>(v));
267 ConstantInt *pValConst = cast<ConstantInt>(v);
268 return pValConst->getSExtValue();
269 }
270
271 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
272 {
273 std::vector<Value*> indices;
274 for (auto i : indexList)
275 indices.push_back(i);
276 return GEPA(ptr, indices);
277 }
278
279 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
280 {
281 std::vector<Value*> indices;
282 for (auto i : indexList)
283 indices.push_back(C(i));
284 return GEPA(ptr, indices);
285 }
286
287 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
288 {
289 std::vector<Value*> indices;
290 for (auto i : indexList)
291 indices.push_back(i);
292 return IN_BOUNDS_GEP(ptr, indices);
293 }
294
295 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
296 {
297 std::vector<Value*> indices;
298 for (auto i : indexList)
299 indices.push_back(C(i));
300 return IN_BOUNDS_GEP(ptr, indices);
301 }
302
303 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
304 {
305 std::vector<Value*> valIndices;
306 for (auto i : indices)
307 valIndices.push_back(C(i));
308 return LOAD(GEPA(basePtr, valIndices), name);
309 }
310
311 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
312 {
313 std::vector<Value*> valIndices;
314 for (auto i : indices)
315 valIndices.push_back(i);
316 return LOAD(GEPA(basePtr, valIndices), name);
317 }
318
319 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
320 {
321 std::vector<Value*> valIndices;
322 for (auto i : indices)
323 valIndices.push_back(C(i));
324 return STORE(val, GEPA(basePtr, valIndices));
325 }
326
327 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
328 {
329 std::vector<Value*> valIndices;
330 for (auto i : indices)
331 valIndices.push_back(i);
332 return STORE(val, GEPA(basePtr, valIndices));
333 }
334
335 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
336 {
337 std::vector<Value*> args;
338 for (auto arg : argsList)
339 args.push_back(arg);
340 return CALLA(Callee, args);
341 }
342
343 CallInst *Builder::CALL(Value *Callee, Value* arg)
344 {
345 std::vector<Value*> args;
346 args.push_back(arg);
347 return CALLA(Callee, args);
348 }
349
350 CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
351 {
352 std::vector<Value*> args;
353 args.push_back(arg1);
354 args.push_back(arg2);
355 return CALLA(Callee, args);
356 }
357
358 CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
359 {
360 std::vector<Value*> args;
361 args.push_back(arg1);
362 args.push_back(arg2);
363 args.push_back(arg3);
364 return CALLA(Callee, args);
365 }
366
367 //////////////////////////////////////////////////////////////////////////
368 Value *Builder::DEBUGTRAP()
369 {
370 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
371 return CALL(func);
372 }
373
374 Value *Builder::VRCP(Value *va)
375 {
376 return FDIV(VIMMED1(1.0f), va); // 1 / a
377 }
378
379 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
380 {
381 Value* vOut = FMADDPS(vA, vX, vC);
382 vOut = FMADDPS(vB, vY, vOut);
383 return vOut;
384 }
385
386 //////////////////////////////////////////////////////////////////////////
387 /// @brief Generate an i32 masked load operation in LLVM IR. If not
388 /// supported on the underlying platform, emulate it with float masked load
389 /// @param src - base address pointer for the load
390 /// @param vMask - SIMD wide mask that controls whether to access memory or load 0
391 Value *Builder::MASKLOADD(Value* src,Value* mask)
392 {
393 Value* vResult;
394 // use avx2 masked load instruction if available
395 if(JM()->mArch.AVX2())
396 {
397 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
398 vResult = CALL(func,{src,mask});
399 }
400 else
401 {
402 // maskload intrinsic expects integer mask operand in llvm >= 3.8
403 #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
404 mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
405 #else
406 mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
407 #endif
408 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
409 vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
410 }
411 return vResult;
412 }
413
414 //////////////////////////////////////////////////////////////////////////
415 /// @brief insert a JIT call to CallPrint
416 /// - outputs formatted string to both stdout and VS output window
417 /// - DEBUG builds only
418 /// Usage example:
419 /// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
420 /// where C(lane) creates a constant value to print, and pIndex is the Value*
421 /// result from a GEP, printing out the pointer to memory
422 /// @param printStr - constant string to print, which includes format specifiers
423 /// @param printArgs - initializer list of Value*'s to print to std out
424 CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
425 {
426 // push the arguments to CallPrint into a vector
427 std::vector<Value*> printCallArgs;
428 // save room for the format string. we still need to modify it for vectors
429 printCallArgs.resize(1);
430
431 // search through the format string for special processing
432 size_t pos = 0;
433 std::string tempStr(printStr);
434 pos = tempStr.find('%', pos);
435 auto v = printArgs.begin();
436
437 while ((pos != std::string::npos) && (v != printArgs.end()))
438 {
439 Value* pArg = *v;
440 Type* pType = pArg->getType();
441
442 if (pType->isVectorTy())
443 {
444 Type* pContainedType = pType->getContainedType(0);
445
446 if (toupper(tempStr[pos + 1]) == 'X')
447 {
448 tempStr[pos] = '0';
449 tempStr[pos + 1] = 'x';
450 tempStr.insert(pos + 2, "%08X ");
451 pos += 7;
452
453 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
454
455 std::string vectorFormatStr;
456 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
457 {
458 vectorFormatStr += "0x%08X ";
459 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
460 }
461
462 tempStr.insert(pos, vectorFormatStr);
463 pos += vectorFormatStr.size();
464 }
465 else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
466 {
467 uint32_t i = 0;
468 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
469 {
470 tempStr.insert(pos, std::string("%f "));
471 pos += 3;
472 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
473 }
474 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
475 }
476 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
477 {
478 uint32_t i = 0;
479 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
480 {
481 tempStr.insert(pos, std::string("%d "));
482 pos += 3;
483 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
484 }
485 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
486 }
487 }
488 else
489 {
490 if (toupper(tempStr[pos + 1]) == 'X')
491 {
492 tempStr[pos] = '0';
493 tempStr.insert(pos + 1, "x%08");
494 printCallArgs.push_back(pArg);
495 pos += 3;
496 }
497 // for %f we need to cast float Values to doubles so that they print out correctly
498 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
499 {
500 printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
501 pos++;
502 }
503 else
504 {
505 printCallArgs.push_back(pArg);
506 }
507 }
508
509 // advance to the next argument
510 v++;
511 pos = tempStr.find('%', ++pos);
512 }
513
514 // create global variable constant string
515 Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
516 GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
517 JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
518
519 // get a pointer to the first character in the constant string array
520 std::vector<Constant*> geplist{C(0),C(0)};
521 Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
522
523 // insert the pointer to the format string in the argument vector
524 printCallArgs[0] = strGEP;
525
526 // get pointer to CallPrint function and insert decl into the module if needed
527 std::vector<Type*> args;
528 args.push_back(PointerType::get(mInt8Ty,0));
529 FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
530 Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
531
532 // if we haven't yet added the symbol to the symbol table
533 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
534 {
535 sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
536 }
537
538 // insert a call to CallPrint
539 return CALLA(callPrintFn,printCallArgs);
540 }
541
542 //////////////////////////////////////////////////////////////////////////
543 /// @brief Wrapper around PRINT with initializer list.
544 CallInst* Builder::PRINT(const std::string &printStr)
545 {
546 return PRINT(printStr, {});
547 }
548
549 //////////////////////////////////////////////////////////////////////////
550 /// @brief Generate a masked gather operation in LLVM IR. If not
551 /// supported on the underlying platform, emulate it with loads
552 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
553 /// @param pBase - Int8* base VB address pointer value
554 /// @param vIndices - SIMD wide value of VB byte offsets
555 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
556 /// @param scale - value to scale indices by
557 Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
558 {
559 Value* vGather;
560
561 // use avx2 gather instruction if available
562 if(JM()->mArch.AVX2())
563 {
564 // force mask to <N x float>, required by vgather
565 vMask = BITCAST(vMask, mSimdFP32Ty);
566 vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,C(scale));
567 }
568 else
569 {
570 Value* pStack = STACKSAVE();
571
572 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
573 Value* vSrcPtr = ALLOCA(vSrc->getType());
574 STORE(vSrc, vSrcPtr);
575
576 vGather = VUNDEF_F();
577 Value *vScaleVec = VIMMED1((uint32_t)scale);
578 Value *vOffsets = MUL(vIndices,vScaleVec);
579 Value *mask = MASK(vMask);
580 for(uint32_t i = 0; i < mVWidth; ++i)
581 {
582 // single component byte index
583 Value *offset = VEXTRACT(vOffsets,C(i));
584 // byte pointer to component
585 Value *loadAddress = GEP(pBase,offset);
586 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
587 // pointer to the value to load if we're masking off a component
588 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
589 Value *selMask = VEXTRACT(mask,C(i));
590 // switch in a safe address to load from if this lane is masked off
591 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
592 Value *val = LOAD(validAddress);
593 vGather = VINSERT(vGather,val,C(i));
594 }
595 STACKRESTORE(pStack);
596 }
597
598 return vGather;
599 }
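// Minimal usage sketch for GATHERPS (caller-side names are hypothetical; real callers
// such as GATHER4PS below follow the same pattern). vByteOffsets and vLaneMask are
// assumed to already be SIMD-wide values:
//   Value* vDefaults = VIMMED1(0.0f);  // returned in lanes that are masked off
//   Value* vResult   = GATHERPS(vDefaults, pBufferBase, vByteOffsets, vLaneMask);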
600
601 //////////////////////////////////////////////////////////////////////////
602 /// @brief Generate a masked gather operation in LLVM IR. If not
603 /// supported on the underlying platform, emulate it with loads
604 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
605 /// @param pBase - Int8* base VB address pointer value
606 /// @param vIndices - SIMD wide value of VB byte offsets
607 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
608 /// @param scale - value to scale indices by
609 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
610 {
611 Value* vGather;
612
613 // use avx2 gather instruction if available
614 if(JM()->mArch.AVX2())
615 {
616 vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
617 }
618 else
619 {
620 Value* pStack = STACKSAVE();
621
622 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
623 Value* vSrcPtr = ALLOCA(vSrc->getType());
624 STORE(vSrc, vSrcPtr);
625
626 vGather = VUNDEF_I();
627 Value *vScaleVec = VIMMED1((uint32_t)scale);
628 Value *vOffsets = MUL(vIndices, vScaleVec);
629 Value *mask = MASK(vMask);
630 for(uint32_t i = 0; i < mVWidth; ++i)
631 {
632 // single component byte index
633 Value *offset = VEXTRACT(vOffsets, C(i));
634 // byte pointer to component
635 Value *loadAddress = GEP(pBase, offset);
636 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
637 // pointer to the value to load if we're masking off a component
638 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
639 Value *selMask = VEXTRACT(mask, C(i));
640 // switch in a safe address to load from if this lane is masked off
641 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
642 Value *val = LOAD(validAddress, C(0));
643 vGather = VINSERT(vGather, val, C(i));
644 }
645
646 STACKRESTORE(pStack);
647 }
648 return vGather;
649 }
650
651 //////////////////////////////////////////////////////////////////////////
652 /// @brief Generate a masked gather operation in LLVM IR. If not
653 /// supported on the underlying platform, emulate it with loads
654 /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
655 /// @param pBase - Int8* base VB address pointer value
656 /// @param vIndices - SIMD wide value of VB byte offsets
657 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
658 /// @param scale - value to scale indices by
659 Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
660 {
661 Value* vGather;
662
663 // use avx2 gather instruction if available
664 if(JM()->mArch.AVX2())
665 {
666 vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
667 }
668 else
669 {
670 Value* pStack = STACKSAVE();
671
672 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
673 Value* vSrcPtr = ALLOCA(vSrc->getType());
674 STORE(vSrc, vSrcPtr);
675
676 vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
677 Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
678 Value *vOffsets = MUL(vIndices,vScaleVec);
679 Value *mask = MASK(vMask);
680 for(uint32_t i = 0; i < mVWidth/2; ++i)
681 {
682 // single component byte index
683 Value *offset = VEXTRACT(vOffsets,C(i));
684 // byte pointer to component
685 Value *loadAddress = GEP(pBase,offset);
686 loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
687 // pointer to the value to load if we're masking off a component
688 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
689 Value *selMask = VEXTRACT(mask,C(i));
690 // switch in a safe address to load from if this lane is masked off
691 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
692 Value *val = LOAD(validAddress);
693 vGather = VINSERT(vGather,val,C(i));
694 }
695 STACKRESTORE(pStack);
696 }
697 return vGather;
698 }
699
700 #if USE_SIMD16_BUILDER
701 //////////////////////////////////////////////////////////////////////////
702 /// @brief Extract the low (imm == 0) or high (imm != 0) simd-width half of a double-width (simd16) vector
703 Value *Builder::EXTRACT(Value *a2, uint32_t imm)
704 {
705 const uint32_t i0 = (imm > 0) ? mVWidth : 0;
706
707 Value *result = VUNDEF_F();
708
709 for (uint32_t i = 0; i < mVWidth; i += 1)
710 {
711 Value *temp = VEXTRACT(a2, C(i0 + i));
712
713 result = VINSERT(result, temp, C(i));
714 }
715
716 return result;
717 }
718
719 //////////////////////////////////////////////////////////////////////////
720 /// @brief Insert the simd-width vector b into the low (imm == 0) or high (imm != 0) half of the double-width (simd16) vector a2
721 Value *Builder::INSERT(Value *a2, Value * b, uint32_t imm)
722 {
723 const uint32_t i0 = (imm > 0) ? mVWidth : 0;
724
725 Value *result = BITCAST(a2, mSimd2FP32Ty);
726
727 for (uint32_t i = 0; i < mVWidth; i += 1)
728 {
729 #if 1
730 if (!b->getType()->getScalarType()->isFloatTy())
731 {
732 b = BITCAST(b, mSimdFP32Ty);
733 }
734
735 #endif
736 Value *temp = VEXTRACT(b, C(i));
737
738 result = VINSERT(result, temp, C(i0 + i));
739 }
740
741 return result;
742 }
743
744 #endif
745 //////////////////////////////////////////////////////////////////////////
746 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
747 Value* Builder::MASK(Value* vmask)
748 {
749 Value* src = BITCAST(vmask, mSimdInt32Ty);
750 return ICMP_SLT(src, VIMMED1(0));
751 }
752
753 //////////////////////////////////////////////////////////////////////////
754 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
755 Value* Builder::VMASK(Value* mask)
756 {
757 return S_EXT(mask, mSimdInt32Ty);
758 }
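// Note on the two mask forms used throughout this file: VMASK sign-extends an
// <N x i1> mask so a true lane becomes 0xFFFFFFFF (sign bit set), and MASK recovers
// the <N x i1> form by testing that sign bit. Illustrative round trip:
//   Value* x86Mask  = VMASK(cmpResult);  // i1 true -> 0xFFFFFFFF, false -> 0x00000000
//   Value* llvmMask = MASK(x86Mask);     // sign bit set -> i1 true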
759
760 //////////////////////////////////////////////////////////////////////////
761 /// @brief Generate a VPSHUFB operation in LLVM IR. If not
762 /// supported on the underlying platform, emulate it
763 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
764 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
765 /// Byte masks in the lower 128-bit lane of b select 8-bit values from the lower
766 /// 128 bits of a, and vice versa for the upper lane. If a mask
767 /// value is negative, '0' is inserted.
768 Value *Builder::PSHUFB(Value* a, Value* b)
769 {
770 Value* res;
771 // use avx2 pshufb instruction if available
772 if(JM()->mArch.AVX2())
773 {
774 res = VPSHUFB(a, b);
775 }
776 else
777 {
778 Constant* cB = dyn_cast<Constant>(b);
779 // number of 8 bit elements in b
780 uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
781 // output vector
782 Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
783
784 // insert an 8 bit value from the high and low lanes of a per loop iteration
785 numElms /= 2;
786 for(uint32_t i = 0; i < numElms; i++)
787 {
788 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
789 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
790
791 // extract values from constant mask
792 char valLow128bLane = (char)(cLow128b->getSExtValue());
793 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
794
795 Value* insertValLow128b;
796 Value* insertValHigh128b;
797
798 // if the mask value is negative, insert a '0' in the respective output position
799 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
800 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
801 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
802
803 vShuf = VINSERT(vShuf, insertValLow128b, i);
804 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
805 }
806 res = vShuf;
807 }
808 return res;
809 }
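// Small illustration of the PSHUFB semantics above (example values only): within each
// 128-bit lane, a mask byte of 0x05 selects byte 5 of that lane of a, while any
// negative mask byte (e.g. 0x80) produces a zero byte in the output.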
810
811 //////////////////////////////////////////////////////////////////////////
812 /// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32
813 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
814 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
815 /// lower 8 values are used.
816 Value *Builder::PMOVSXBD(Value* a)
817 {
818 // VPMOVSXBD output type
819 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
820 // Extract 8 values from 128bit lane and sign extend
821 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
822 }
823
824 //////////////////////////////////////////////////////////////////////////
825 /// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32
826 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
827 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
828 Value *Builder::PMOVSXWD(Value* a)
829 {
830 // VPMOVSXWD output type
831 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
832 // Extract 8 values from 128bit lane and sign extend
833 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
834 }
835
836 //////////////////////////////////////////////////////////////////////////
837 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
838 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
839 /// platform, emulate it
840 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
841 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
842 Value *Builder::PERMD(Value* a, Value* idx)
843 {
844 Value* res;
845 // use avx2 permute instruction if available
846 if(JM()->mArch.AVX2())
847 {
848 res = VPERMD(a, idx);
849 }
850 else
851 {
852 if (isa<Constant>(idx))
853 {
854 res = VSHUFFLE(a, a, idx);
855 }
856 else
857 {
858 res = VUNDEF_I();
859 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
860 {
861 Value* pIndex = VEXTRACT(idx, C(l));
862 Value* pVal = VEXTRACT(a, pIndex);
863 res = VINSERT(res, pVal, C(l));
864 }
865 }
866 }
867 return res;
868 }
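// In effect PERMD computes res[l] = a[idx[l]] for each of the 8 lanes across the full
// 256-bit register; for example (illustrative), idx = {7, 6, 5, 4, 3, 2, 1, 0} reverses a.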
869
870 //////////////////////////////////////////////////////////////////////////
871 /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
872 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
873 /// platform, emulate it
874 /// @param a - 256bit SIMD lane(8x32bit) of float values.
875 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
876 Value *Builder::PERMPS(Value* a, Value* idx)
877 {
878 Value* res;
879 // use avx2 permute instruction if available
880 if (JM()->mArch.AVX2())
881 {
882 // llvm 3.6.0 swapped the order of the args to vpermps
883 res = VPERMPS(idx, a);
884 }
885 else
886 {
887 if (isa<Constant>(idx))
888 {
889 res = VSHUFFLE(a, a, idx);
890 }
891 else
892 {
893 res = VUNDEF_F();
894 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
895 {
896 Value* pIndex = VEXTRACT(idx, C(l));
897 Value* pVal = VEXTRACT(a, pIndex);
898 res = VINSERT(res, pVal, C(l));
899 }
900 }
901 }
902
903 return res;
904 }
905
906 //////////////////////////////////////////////////////////////////////////
907 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
908 /// in LLVM IR. If not supported on the underlying platform, emulate it
909 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
910 Value *Builder::CVTPH2PS(Value* a)
911 {
912 if (JM()->mArch.F16C())
913 {
914 return VCVTPH2PS(a);
915 }
916 else
917 {
918 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
919 Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy));
920
921 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == nullptr)
922 {
923 sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", (void *)&ConvertFloat16ToFloat32);
924 }
925
926 Value* pResult = UndefValue::get(mSimdFP32Ty);
927 for (uint32_t i = 0; i < mVWidth; ++i)
928 {
929 Value* pSrc = VEXTRACT(a, C(i));
930 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
931 pResult = VINSERT(pResult, pConv, C(i));
932 }
933
934 return pResult;
935 }
936 }
937
938 //////////////////////////////////////////////////////////////////////////
939 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
940 /// in LLVM IR. If not supported on the underlying platform, emulate it
941 /// @param a - 256bit SIMD lane(8x32bit) of float32 values
942 Value *Builder::CVTPS2PH(Value* a, Value* rounding)
943 {
944 if (JM()->mArch.F16C())
945 {
946 return VCVTPS2PH(a, rounding);
947 }
948 else
949 {
950 // call scalar C function for now
951 FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
952 Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy));
953
954 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr)
955 {
956 sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", (void *)&ConvertFloat32ToFloat16);
957 }
958
959 Value* pResult = UndefValue::get(mSimdInt16Ty);
960 for (uint32_t i = 0; i < mVWidth; ++i)
961 {
962 Value* pSrc = VEXTRACT(a, C(i));
963 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
964 pResult = VINSERT(pResult, pConv, C(i));
965 }
966
967 return pResult;
968 }
969 }
970
971 Value *Builder::PMAXSD(Value* a, Value* b)
972 {
973 Value* cmp = ICMP_SGT(a, b);
974 return SELECT(cmp, a, b);
975 }
976
977 Value *Builder::PMINSD(Value* a, Value* b)
978 {
979 Value* cmp = ICMP_SLT(a, b);
980 return SELECT(cmp, a, b);
981 }
982
983 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
984 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
985 {
986 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
987 if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
988 {
989 // ensure our mask is the correct type
990 mask = BITCAST(mask, mSimdFP32Ty);
991 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
992 }
993 else
994 {
995 // ensure our mask is the correct type
996 mask = BITCAST(mask, mSimdInt32Ty);
997 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
998 }
999 }
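// Sketch of a typical Gather4 call (the variable names and format are assumptions for
// illustration; any 4-component 32-bit float format takes the same path):
//   Value* vComponents[4];
//   Gather4(R32G32B32A32_FLOAT, pSurfaceBase, vByteOffsets, vFetchMask, vComponents, false);
//   // vComponents[0..3] now hold the swizzled x/y/z/w SIMD vectors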
1000
1001 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1002 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1003 {
1004 switch(info.bpp / info.numComps)
1005 {
1006 case 16:
1007 {
1008 Value* vGatherResult[2];
1009 Value *vMask;
1010
1011 // TODO: vGatherMaskedVal
1012 Value* vGatherMaskedVal = VIMMED1((float)0);
1013
1014 // always have at least one component out of x or y to fetch
1015
1016 // save mask as it is zero'd out after each gather
1017 vMask = mask;
1018
1019 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1020 // e.g. result of first 8x32bit integer gather for 16bit components
1021 // 256i - 0 1 2 3 4 5 6 7
1022 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1023 //
1024
1025 // if we have at least one component out of z or w to fetch
1026 if(info.numComps > 2)
1027 {
1028 // offset base to the next components(zw) in the vertex to gather
1029 pSrcBase = GEP(pSrcBase, C((char)4));
1030 vMask = mask;
1031
1032 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1033 // e.g. result of second 8x32bit integer gather for 16bit components
1034 // 256i - 0 1 2 3 4 5 6 7
1035 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1036 //
1037 }
1038 else
1039 {
1040 vGatherResult[1] = vGatherMaskedVal;
1041 }
1042
1043 // Shuffle gathered components into place, each row is a component
1044 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1045 }
1046 break;
1047 case 32:
1048 {
1049 // apply defaults
1050 for (uint32_t i = 0; i < 4; ++i)
1051 {
1052 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1053 }
1054
1055 for(uint32_t i = 0; i < info.numComps; i++)
1056 {
1057 uint32_t swizzleIndex = info.swizzle[i];
1058
1059 // save mask as it is zero'd out after each gather
1060 Value *vMask = mask;
1061
1062 // Gather a SIMD of components
1063 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
1064
1065 // offset base to the next component to gather
1066 pSrcBase = GEP(pSrcBase, C((char)4));
1067 }
1068 }
1069 break;
1070 default:
1071 SWR_INVALID("Invalid float format");
1072 break;
1073 }
1074 }
1075
1076 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1077 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1078 {
1079 switch (info.bpp / info.numComps)
1080 {
1081 case 8:
1082 {
1083 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1084 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask);
1085 // e.g. result of an 8x32bit integer gather for 8bit components
1086 // 256i - 0 1 2 3 4 5 6 7
1087 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1088
1089 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1090 }
1091 break;
1092 case 16:
1093 {
1094 Value* vGatherResult[2];
1095 Value *vMask;
1096
1097 // TODO: vGatherMaskedVal
1098 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1099
1100 // always have at least one component out of x or y to fetch
1101
1102 // save mask as it is zero'd out after each gather
1103 vMask = mask;
1104
1105 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1106 // e.g. result of first 8x32bit integer gather for 16bit components
1107 // 256i - 0 1 2 3 4 5 6 7
1108 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1109 //
1110
1111 // if we have at least one component out of z or w to fetch
1112 if(info.numComps > 2)
1113 {
1114 // offset base to the next components(zw) in the vertex to gather
1115 pSrcBase = GEP(pSrcBase, C((char)4));
1116 vMask = mask;
1117
1118 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1119 // e.g. result of second 8x32bit integer gather for 16bit components
1120 // 256i - 0 1 2 3 4 5 6 7
1121 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1122 //
1123 }
1124 else
1125 {
1126 vGatherResult[1] = vGatherMaskedVal;
1127 }
1128
1129 // Shuffle gathered components into place, each row is a component
1130 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1131
1132 }
1133 break;
1134 case 32:
1135 {
1136 // apply defaults
1137 for (uint32_t i = 0; i < 4; ++i)
1138 {
1139 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1140 }
1141
1142 for(uint32_t i = 0; i < info.numComps; i++)
1143 {
1144 uint32_t swizzleIndex = info.swizzle[i];
1145
1146 // save mask as it is zero'd out after each gather
1147 Value *vMask = mask;
1148
1149 // Gather a SIMD of components
1150 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
1151
1152 // offset base to the next component to gather
1153 pSrcBase = GEP(pSrcBase, C((char)4));
1154 }
1155 }
1156 break;
1157 default:
1158 SWR_INVALID("unsupported format");
1159 break;
1160 }
1161 }
1162
1163 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1164 {
1165 // cast types
1166 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1167 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1168
1169 // input could either be float or int vector; do shuffle work in int
1170 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1171 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1172
1173 if(bPackedOutput)
1174 {
1175 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1176
1177 // shuffle mask
1178 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1179 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1180 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1181 // after pshufb: group components together in each 128bit lane
1182 // 256i - 0 1 2 3 4 5 6 7
1183 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1184
1185 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1186 // after PERMD: move and pack xy components into each 128bit lane
1187 // 256i - 0 1 2 3 4 5 6 7
1188 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1189
1190 // do the same for zw components
1191 Value* vi128ZW = nullptr;
1192 if(info.numComps > 2)
1193 {
1194 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1195 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1196 }
1197
1198 for(uint32_t i = 0; i < 4; i++)
1199 {
1200 uint32_t swizzleIndex = info.swizzle[i];
1201 // todo: fix for packed
1202 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1203 if(i >= info.numComps)
1204 {
1205 // set the default component val
1206 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1207 continue;
1208 }
1209
1210 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1211 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1212 // if x or y, use vi128XY permute result, else use vi128ZW
1213 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1214
1215 // extract packed component 128 bit lanes
1216 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1217 }
1218
1219 }
1220 else
1221 {
1222 // pshufb masks for each component
1223 Value* vConstMask[2];
1224 // x/z shuffle mask
1225 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1226 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1227
1228 // y/w shuffle mask
1229 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1230 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1231
1232
1233 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1234 // apply defaults
1235 for (uint32_t i = 0; i < 4; ++i)
1236 {
1237 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1238 }
1239
1240 for(uint32_t i = 0; i < info.numComps; i++)
1241 {
1242 uint32_t swizzleIndex = info.swizzle[i];
1243
1244 // select correct constMask for x/z or y/w pshufb
1245 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1246 // if x or y, use the first gather result (xy), else use the second (zw)
1247 uint32_t selectedGather = (i < 2) ? 0 : 1;
1248
1249 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1250 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1251 // 256i - 0 1 2 3 4 5 6 7
1252 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1253 }
1254 }
1255 }
1256
1257 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1258 {
1259 // cast types
1260 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1261 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1262
1263 if(bPackedOutput)
1264 {
1265 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1266 // shuffle mask
1267 Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1268 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1269 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1270 // after pshufb: group components together in each 128bit lane
1271 // 256i - 0 1 2 3 4 5 6 7
1272 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1273
1274 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1275 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1276 // 256i - 0 1 2 3 4 5 6 7
1277 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1278
1279 // do the same for zw components
1280 Value* vi128ZW = nullptr;
1281 if(info.numComps > 2)
1282 {
1283 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1284 }
1285
1286 // for each component, either select the default value or extract the packed 128bit lane
1287 for(uint32_t i = 0; i < 4; i++)
1288 {
1289 uint32_t swizzleIndex = info.swizzle[i];
1290 // todo: fix for packed
1291 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1292 if(i >= info.numComps)
1293 {
1294 // set the default component val
1295 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1296 continue;
1297 }
1298
1299 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1300 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1301 // if x or y, use vi128XY permute result, else use vi128ZW
1302 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1303
1304 // extract packed component 128 bit lanes
1305 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1306 }
1307 }
1308 // else zero extend
1309 else{
1310 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1311 // apply defaults
1312 for (uint32_t i = 0; i < 4; ++i)
1313 {
1314 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1315 }
1316
1317 for(uint32_t i = 0; i < info.numComps; i++){
1318 uint32_t swizzleIndex = info.swizzle[i];
1319
1320 // pshufb masks for each component
1321 Value* vConstMask;
1322 switch(i)
1323 {
1324 case 0:
1325 // x shuffle mask
1326 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1327 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1328 break;
1329 case 1:
1330 // y shuffle mask
1331 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1332 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1333 break;
1334 case 2:
1335 // z shuffle mask
1336 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1337 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1338 break;
1339 case 3:
1340 // w shuffle mask
1341 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1342 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1343 break;
1344 default:
1345 vConstMask = nullptr;
1346 break;
1347 }
1348
1349 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1350 // after pshufb for x channel
1351 // 256i - 0 1 2 3 4 5 6 7
1352 // x000 x000 x000 x000 x000 x000 x000 x000
1353 }
1354 }
1355 }
1356
1357 // Helper function to create alloca in entry block of function
1358 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
1359 {
1360 auto saveIP = IRB()->saveIP();
1361 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1362 pFunc->getEntryBlock().begin());
1363 Value* pAlloca = ALLOCA(pType);
1364 if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1365 return pAlloca;
1366 }
1367
1368 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
1369 {
1370 auto saveIP = IRB()->saveIP();
1371 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1372 pFunc->getEntryBlock().begin());
1373 Value* pAlloca = ALLOCA(pType, pArraySize);
1374 if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1375 return pAlloca;
1376 }
1377
1378 //////////////////////////////////////////////////////////////////////////
1379 /// @brief emulates a scatter operation.
1380 /// @param pDst - pointer to destination
1381 /// @param vSrc - vector of src data to scatter
1382 /// @param vOffsets - vector of byte offsets from pDst
1383 /// @param vMask - mask of valid lanes
1384 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1385 {
1386 /* Scatter algorithm
1387
1388 while(Index = BitScanForward(mask))
1389 srcElem = srcVector[Index]
1390 offsetElem = offsetVector[Index]
1391 *(pDst + offsetElem) = srcElem
1392 Update mask (mask &= ~(1 << Index))
1393
1394 */
1395
1396 BasicBlock* pCurBB = IRB()->GetInsertBlock();
1397 Function* pFunc = pCurBB->getParent();
1398 Type* pSrcTy = vSrc->getType()->getVectorElementType();
1399
1400 // Store vectors on stack
1401 if (pScatterStackSrc == nullptr)
1402 {
1403 // Save off stack allocations and reuse per scatter. Significantly reduces stack
1404 // requirements for shaders with a lot of scatters.
1405 pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
1406 pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
1407 }
1408
1409 Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
1410 Value* pOffsetsArrayPtr = pScatterStackOffsets;
1411 STORE(vSrc, pSrcArrayPtr);
1412 STORE(vOffsets, pOffsetsArrayPtr);
1413
1414 // Cast to pointers for random access
1415 pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
1416 pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
1417
1418 Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
1419
1420 // Get cttz function
1421 Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
1422
1423 // Setup loop basic block
1424 BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc);
1425
1426 // compute first set bit
1427 Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
1428
1429 Value* pIsUndef = ICMP_EQ(pIndex, C(32));
1430
1431 // Split current block
1432 BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
1433
1434 // Remove unconditional jump created by splitBasicBlock
1435 pCurBB->getTerminator()->eraseFromParent();
1436
1437 // Add terminator to end of original block
1438 IRB()->SetInsertPoint(pCurBB);
1439
1440 // Add conditional branch
1441 COND_BR(pIsUndef, pPostLoop, pLoop);
1442
1443 // Add loop basic block contents
1444 IRB()->SetInsertPoint(pLoop);
1445 PHINode* pIndexPhi = PHI(mInt32Ty, 2);
1446 PHINode* pMaskPhi = PHI(mInt32Ty, 2);
1447
1448 pIndexPhi->addIncoming(pIndex, pCurBB);
1449 pMaskPhi->addIncoming(pMask, pCurBB);
1450
1451 // Extract elements for this index
1452 Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
1453 Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
1454
1455 // GEP to this offset in dst
1456 Value* pCurDst = GEP(pDst, pOffsetElem);
1457 pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
1458 STORE(pSrcElem, pCurDst);
1459
1460 // Update the mask
1461 Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
1462
1463 // Terminator
1464 Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
1465
1466 pIsUndef = ICMP_EQ(pNewIndex, C(32));
1467 COND_BR(pIsUndef, pPostLoop, pLoop);
1468
1469 // Update phi edges
1470 pIndexPhi->addIncoming(pNewIndex, pLoop);
1471 pMaskPhi->addIncoming(pNewMask, pLoop);
1472
1473 // Move builder to beginning of post loop
1474 IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
1475 }
1476
1477 Value* Builder::VABSPS(Value* a)
1478 {
1479 Value* asInt = BITCAST(a, mSimdInt32Ty);
1480 Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1481 return result;
1482 }
1483
1484 Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
1485 {
1486 Value *lowCmp = ICMP_SLT(src, low);
1487 Value *ret = SELECT(lowCmp, low, src);
1488
1489 Value *highCmp = ICMP_SGT(ret, high);
1490 ret = SELECT(highCmp, high, ret);
1491
1492 return ret;
1493 }
1494
1495 Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1496 {
1497 Value *lowCmp = FCMP_OLT(src, low);
1498 Value *ret = SELECT(lowCmp, low, src);
1499
1500 Value *highCmp = FCMP_OGT(ret, high);
1501 ret = SELECT(highCmp, high, ret);
1502
1503 return ret;
1504 }
1505
1506 Value *Builder::FCLAMP(Value* src, float low, float high)
1507 {
1508 Value* result = VMAXPS(src, VIMMED1(low));
1509 result = VMINPS(result, VIMMED1(high));
1510
1511 return result;
1512 }
1513
1514 //////////////////////////////////////////////////////////////////////////
1515 /// @brief save/restore stack, providing ability to push/pop the stack and
1516 /// reduce overall stack requirements for temporary stack use
1517 Value* Builder::STACKSAVE()
1518 {
1519 Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1520 return CALLA(pfnStackSave);
1521 }
1522
1523 void Builder::STACKRESTORE(Value* pSaved)
1524 {
1525 Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1526 CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1527 }
1528
1529 Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1530 {
1531 Value* vOut;
1532 // use FMADs if available
1533 if(JM()->mArch.AVX2())
1534 {
1535 vOut = VFMADDPS(a, b, c);
1536 }
1537 else
1538 {
1539 vOut = FADD(FMUL(a, b), c);
1540 }
1541 return vOut;
1542 }
1543
1544 Value* Builder::POPCNT(Value* a)
1545 {
1546 Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1547 return CALL(pCtPop, std::initializer_list<Value*>{a});
1548 }
1549
1550 //////////////////////////////////////////////////////////////////////////
1551 /// @brief C functions called by LLVM IR
1552 //////////////////////////////////////////////////////////////////////////
1553
1554 //////////////////////////////////////////////////////////////////////////
1555 /// @brief called in JIT code, inserted by PRINT
1556 /// output to both stdout and visual studio debug console
1557 void __cdecl CallPrint(const char* fmt, ...)
1558 {
1559 va_list args;
1560 va_start(args, fmt);
1561 vprintf(fmt, args);
1562
1563 #if defined( _WIN32 )
1564 char strBuf[1024];
1565 vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1566 OutputDebugStringA(strBuf);
1567 #endif
1568
1569 va_end(args);
1570 }
1571
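//////////////////////////////////////////////////////////////////////////
/// @brief Emulate VEXTRACTI128 with a shuffle: returns the lower (imm8 == 0)
/// or upper (imm8 != 0) half of vector a.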
1572 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1573 {
1574 bool flag = !imm8->isZeroValue();
1575 SmallVector<Constant*,8> idx;
1576 for (unsigned i = 0; i < mVWidth / 2; i++) {
1577 idx.push_back(C(flag ? i + mVWidth / 2 : i));
1578 }
1579 return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1580 }
1581
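//////////////////////////////////////////////////////////////////////////
/// @brief Emulate VINSERTI128 with shuffles: replaces the lower (imm8 == 0)
/// or upper (imm8 != 0) half of a with the first mVWidth/2 elements of b.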
1582 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1583 {
1584 bool flag = !imm8->isZeroValue();
1585 SmallVector<Constant*,8> idx;
1586 for (unsigned i = 0; i < mVWidth; i++) {
1587 idx.push_back(C(i));
1588 }
1589 Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1590
1591 SmallVector<Constant*,8> idx2;
1592 for (unsigned i = 0; i < mVWidth / 2; i++) {
1593 idx2.push_back(C(flag ? i : i + mVWidth));
1594 }
1595 for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1596 idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1597 }
1598 return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1599 }
1600
1601 // rdtsc buckets macros
1602 void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1603 {
1604 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1605 // buckets framework when single threaded
1606 if (KNOB_SINGLE_THREADED)
1607 {
1608 std::vector<Type*> args{
1609 PointerType::get(mInt32Ty, 0), // pBucketMgr
1610 mInt32Ty // id
1611 };
1612
1613 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1614 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1615 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1616 {
1617 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1618 }
1619
1620 CALL(pFunc, { pBucketMgr, pId });
1621 }
1622 }
1623
1624 void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1625 {
1626 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1627 // buckets framework when single threaded
1628 if (KNOB_SINGLE_THREADED)
1629 {
1630 std::vector<Type*> args{
1631 PointerType::get(mInt32Ty, 0), // pBucketMgr
1632 mInt32Ty // id
1633 };
1634
1635 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1636 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1637 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1638 {
1639 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1640 }
1641
1642 CALL(pFunc, { pBucketMgr, pId });
1643 }
1644 }
1645
1646
1647 uint32_t Builder::GetTypeSize(Type* pType)
1648 {
1649 if (pType->isStructTy())
1650 {
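// NOTE: this sizes the struct as numElems copies of the first element's type;
// it assumes a homogeneous struct and ignores padding.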
1651 uint32_t numElems = pType->getStructNumElements();
1652 Type* pElemTy = pType->getStructElementType(0);
1653 return numElems * GetTypeSize(pElemTy);
1654 }
1655
1656 if (pType->isArrayTy())
1657 {
1658 uint32_t numElems = pType->getArrayNumElements();
1659 Type* pElemTy = pType->getArrayElementType();
1660 return numElems * GetTypeSize(pElemTy);
1661 }
1662
1663 if (pType->isIntegerTy())
1664 {
1665 uint32_t bitSize = pType->getIntegerBitWidth();
1666 return bitSize / 8;
1667 }
1668
1669 if (pType->isFloatTy())
1670 {
1671 return 4;
1672 }
1673
1674 if (pType->isHalfTy())
1675 {
1676 return 2;
1677 }
1678
1679 if (pType->isDoubleTy())
1680 {
1681 return 8;
1682 }
1683
1684 SWR_ASSERT(false, "Unimplemented type.");
1685 return 0;
1686 }
1687 }