swr/rast: SIMD16 Fetch - Fully widen 32-bit float vertex components
[mesa.git] src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "builder.h"
31 #include "common/rdtsc_buckets.h"
32
33 #include <cstdarg>
34
35 namespace SwrJit
36 {
37 void __cdecl CallPrint(const char* fmt, ...);
38
39 //////////////////////////////////////////////////////////////////////////
40 /// @brief Convert an IEEE 754 32-bit single-precision float to a
41 ///        16-bit half-precision float with 5 exponent bits and
42 ///        10 mantissa bits.
43 /// @param val - 32-bit float
44 /// @todo Maybe move this outside of this file into a header?
45 static uint16_t ConvertFloat32ToFloat16(float val)
46 {
47 uint32_t sign, exp, mant;
48 uint32_t roundBits;
49
50 // Extract the sign, exponent, and mantissa
51 uint32_t uf = *(uint32_t*)&val;
52 sign = (uf & 0x80000000) >> 31;
53 exp = (uf & 0x7F800000) >> 23;
54 mant = uf & 0x007FFFFF;
55
56 // Check for out of range
57 if (std::isnan(val))
58 {
59 exp = 0x1F;
60 mant = 0x200;
61 sign = 1; // set the sign bit for NANs
62 }
63 else if (std::isinf(val))
64 {
65 exp = 0x1f;
66 mant = 0x0;
67 }
68 else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
69 {
70 exp = 0x1E;
71 mant = 0x3FF;
72 }
73 else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
74 {
75 mant |= 0x00800000;
76 for (; exp <= 0x70; mant >>= 1, exp++)
77 ;
78 exp = 0;
79 mant = mant >> 13;
80 }
81 else if (exp < 0x66) // Too small to represent -> Zero
82 {
83 exp = 0;
84 mant = 0;
85 }
86 else
87 {
88 // Saves bits that will be shifted off for rounding
89 roundBits = mant & 0x1FFFu;
90 // convert exponent and mantissa to 16 bit format
91 exp = exp - 0x70;
92 mant = mant >> 13;
93
94 // Essentially RTZ, but round up if off by only 1 lsb
95 if (roundBits == 0x1FFFu)
96 {
97 mant++;
98 // check for overflow
99 if ((mant & 0xC00u) != 0)
100 exp++;
101 // make sure only the needed bits are used
102 mant &= 0x3FF;
103 }
104 }
105
106 uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
107 return (uint16_t)tmpVal;
108 }
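// Worked example (for reference): 1.0f has biased exponent 0x7F and a zero
// mantissa, so its half-precision encoding is (0x7F - 0x70) << 10 = 0x3C00,
// and ConvertFloat16ToFloat32(0x3C00) recovers 1.0f.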
109
110 //////////////////////////////////////////////////////////////////////////
111 /// @brief Convert an IEEE 754 16-bit float to a 32-bit single-precision
112 ///        float
113 /// @param val - 16-bit float
114 /// @todo Maybe move this outside of this file into a header?
115 static float ConvertFloat16ToFloat32(uint32_t val)
116 {
117 uint32_t result;
118 if ((val & 0x7fff) == 0)
119 {
120 result = ((uint32_t)(val & 0x8000)) << 16;
121 }
122 else if ((val & 0x7c00) == 0x7c00)
123 {
124 result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
125 result |= ((uint32_t)val & 0x8000) << 16;
126 }
127 else
128 {
129 uint32_t sign = (val & 0x8000) << 16;
130 uint32_t mant = (val & 0x3ff) << 13;
131 uint32_t exp = (val >> 10) & 0x1f;
132 if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
133 {
134 mant <<= 1;
135 while (mant < (0x400 << 13))
136 {
137 exp--;
138 mant <<= 1;
139 }
140 mant &= (0x3ff << 13);
141 }
142 exp = ((exp - 15 + 127) & 0xff) << 23;
143 result = sign | exp | mant;
144 }
145
146 return *(float*)&result;
147 }
148
149 Constant *Builder::C(bool i)
150 {
151 return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
152 }
153
154 Constant *Builder::C(char i)
155 {
156 return ConstantInt::get(IRB()->getInt8Ty(), i);
157 }
158
159 Constant *Builder::C(uint8_t i)
160 {
161 return ConstantInt::get(IRB()->getInt8Ty(), i);
162 }
163
164 Constant *Builder::C(int i)
165 {
166 return ConstantInt::get(IRB()->getInt32Ty(), i);
167 }
168
169 Constant *Builder::C(int64_t i)
170 {
171 return ConstantInt::get(IRB()->getInt64Ty(), i);
172 }
173
174 Constant *Builder::C(uint16_t i)
175 {
176 return ConstantInt::get(mInt16Ty,i);
177 }
178
179 Constant *Builder::C(uint32_t i)
180 {
181 return ConstantInt::get(IRB()->getInt32Ty(), i);
182 }
183
184 Constant *Builder::C(float i)
185 {
186 return ConstantFP::get(IRB()->getFloatTy(), i);
187 }
188
189 Constant *Builder::PRED(bool pred)
190 {
191 return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
192 }
193
194 Value *Builder::VIMMED1(int i)
195 {
196 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
197 }
198
199 Value *Builder::VIMMED1(uint32_t i)
200 {
201 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
202 }
203
204 Value *Builder::VIMMED1(float i)
205 {
206 return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
207 }
208
209 Value *Builder::VIMMED1(bool i)
210 {
211 return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
212 }
213
214 #if USE_SIMD16_BUILDER
215 Value *Builder::VIMMED2_1(int i)
216 {
217 return ConstantVector::getSplat(mVWidth2, cast<ConstantInt>(C(i)));
218 }
219
220 Value *Builder::VIMMED2_1(uint32_t i)
221 {
222 return ConstantVector::getSplat(mVWidth2, cast<ConstantInt>(C(i)));
223 }
224
225 Value *Builder::VIMMED2_1(float i)
226 {
227 return ConstantVector::getSplat(mVWidth2, cast<ConstantFP>(C(i)));
228 }
229
230 Value *Builder::VIMMED2_1(bool i)
231 {
232 return ConstantVector::getSplat(mVWidth2, cast<ConstantInt>(C(i)));
233 }
234
235 #endif
236 Value *Builder::VUNDEF_IPTR()
237 {
238 return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
239 }
240
241 Value *Builder::VUNDEF_I()
242 {
243 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
244 }
245
246 Value *Builder::VUNDEF(Type *ty, uint32_t size)
247 {
248 return UndefValue::get(VectorType::get(ty, size));
249 }
250
251 Value *Builder::VUNDEF_F()
252 {
253 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
254 }
255
256 #if USE_SIMD16_BUILDER
257 Value *Builder::VUNDEF2_F()
258 {
259 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth2));
260 }
261
262 Value *Builder::VUNDEF2_I()
263 {
264 return UndefValue::get(VectorType::get(mInt32Ty, mVWidth2));
265 }
266
267 #endif
268 Value *Builder::VUNDEF(Type* t)
269 {
270 return UndefValue::get(VectorType::get(t, mVWidth));
271 }
272
273 Value *Builder::VBROADCAST(Value *src)
274 {
275 // check if src is already a vector
276 if (src->getType()->isVectorTy())
277 {
278 return src;
279 }
280
281 return VECTOR_SPLAT(mVWidth, src);
282 }
283
284 #if USE_SIMD16_BUILDER
285 Value *Builder::VBROADCAST2(Value *src)
286 {
287 // check if src is already a vector
288 if (src->getType()->isVectorTy())
289 {
290 return src;
291 }
292
293 return VECTOR_SPLAT(mVWidth2, src);
294 }
295
296 #endif
297 uint32_t Builder::IMMED(Value* v)
298 {
299 SWR_ASSERT(isa<ConstantInt>(v));
300 ConstantInt *pValConst = cast<ConstantInt>(v);
301 return pValConst->getZExtValue();
302 }
303
304 int32_t Builder::S_IMMED(Value* v)
305 {
306 SWR_ASSERT(isa<ConstantInt>(v));
307 ConstantInt *pValConst = cast<ConstantInt>(v);
308 return pValConst->getSExtValue();
309 }
310
311 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
312 {
313 std::vector<Value*> indices;
314 for (auto i : indexList)
315 indices.push_back(i);
316 return GEPA(ptr, indices);
317 }
318
319 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
320 {
321 std::vector<Value*> indices;
322 for (auto i : indexList)
323 indices.push_back(C(i));
324 return GEPA(ptr, indices);
325 }
326
327 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
328 {
329 std::vector<Value*> indices;
330 for (auto i : indexList)
331 indices.push_back(i);
332 return IN_BOUNDS_GEP(ptr, indices);
333 }
334
335 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
336 {
337 std::vector<Value*> indices;
338 for (auto i : indexList)
339 indices.push_back(C(i));
340 return IN_BOUNDS_GEP(ptr, indices);
341 }
342
343 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
344 {
345 std::vector<Value*> valIndices;
346 for (auto i : indices)
347 valIndices.push_back(C(i));
348 return LOAD(GEPA(basePtr, valIndices), name);
349 }
350
351 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
352 {
353 std::vector<Value*> valIndices;
354 for (auto i : indices)
355 valIndices.push_back(i);
356 return LOAD(GEPA(basePtr, valIndices), name);
357 }
358
359 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
360 {
361 std::vector<Value*> valIndices;
362 for (auto i : indices)
363 valIndices.push_back(C(i));
364 return STORE(val, GEPA(basePtr, valIndices));
365 }
366
367 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
368 {
369 std::vector<Value*> valIndices;
370 for (auto i : indices)
371 valIndices.push_back(i);
372 return STORE(val, GEPA(basePtr, valIndices));
373 }
374
375 CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
376 {
377 std::vector<Value*> args;
378 for (auto arg : argsList)
379 args.push_back(arg);
380 return CALLA(Callee, args);
381 }
382
383 CallInst *Builder::CALL(Value *Callee, Value* arg)
384 {
385 std::vector<Value*> args;
386 args.push_back(arg);
387 return CALLA(Callee, args);
388 }
389
390 CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
391 {
392 std::vector<Value*> args;
393 args.push_back(arg1);
394 args.push_back(arg2);
395 return CALLA(Callee, args);
396 }
397
398 CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
399 {
400 std::vector<Value*> args;
401 args.push_back(arg1);
402 args.push_back(arg2);
403 args.push_back(arg3);
404 return CALLA(Callee, args);
405 }
406
407 //////////////////////////////////////////////////////////////////////////
408 Value *Builder::DEBUGTRAP()
409 {
410 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
411 return CALL(func);
412 }
413
414 Value *Builder::VRCP(Value *va)
415 {
416 return FDIV(VIMMED1(1.0f), va); // 1 / a
417 }
418
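//////////////////////////////////////////////////////////////////////////
/// @brief Evaluate a plane equation per SIMD lane: vA * vX + vB * vY + vC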
419 Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
420 {
421 Value* vOut = FMADDPS(vA, vX, vC);
422 vOut = FMADDPS(vB, vY, vOut);
423 return vOut;
424 }
425
426 //////////////////////////////////////////////////////////////////////////
427 /// @brief Generate an i32 masked load operation in LLVM IR. If not
428 /// supported on the underlying platform, emulate it with float masked load
429 /// @param src - base address pointer for the load
430 /// @param vMask - SIMD wide mask that controls whether to load from memory or return 0 for a lane
431 Value *Builder::MASKLOADD(Value* src,Value* mask)
432 {
433 Value* vResult;
434 // use avx2 masked load intrinsic if available
435 if(JM()->mArch.AVX2())
436 {
437 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
438 vResult = CALL(func,{src,mask});
439 }
440 else
441 {
442 // maskload intrinsic expects integer mask operand in llvm >= 3.8
443 #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
444 mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
445 #else
446 mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
447 #endif
448 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
449 vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
450 }
451 return vResult;
452 }
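// Usage sketch (hypothetical names): conditionally load 8 dwords, yielding 0
// for any lane whose mask sign bit is clear:
//   Value* vData = MASKLOADD(pDwordBase, VMASK(vLaneEnable));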
453
454 //////////////////////////////////////////////////////////////////////////
455 /// @brief insert a JIT call to CallPrint
456 /// - outputs formatted string to both stdout and VS output window
457 /// - DEBUG builds only
458 /// Usage example:
459 /// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
460 /// where C(lane) creates a constant value to print, and pIndex is the Value*
461 /// result from a GEP, printing out the pointer to memory
462 /// @param printStr - constant string to print, which includes format specifiers
463 /// @param printArgs - initializer list of Value*'s to print to std out
464 CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
465 {
466 // push the arguments to CallPrint into a vector
467 std::vector<Value*> printCallArgs;
468 // save room for the format string. we still need to modify it for vectors
469 printCallArgs.resize(1);
470
471 // search through the format string for special processing
472 size_t pos = 0;
473 std::string tempStr(printStr);
474 pos = tempStr.find('%', pos);
475 auto v = printArgs.begin();
476
477 while ((pos != std::string::npos) && (v != printArgs.end()))
478 {
479 Value* pArg = *v;
480 Type* pType = pArg->getType();
481
482 if (pType->isVectorTy())
483 {
484 Type* pContainedType = pType->getContainedType(0);
485
486 if (toupper(tempStr[pos + 1]) == 'X')
487 {
488 tempStr[pos] = '0';
489 tempStr[pos + 1] = 'x';
490 tempStr.insert(pos + 2, "%08X ");
491 pos += 7;
492
493 printCallArgs.push_back(VEXTRACT(pArg, C(0)));
494
495 std::string vectorFormatStr;
496 for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
497 {
498 vectorFormatStr += "0x%08X ";
499 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
500 }
501
502 tempStr.insert(pos, vectorFormatStr);
503 pos += vectorFormatStr.size();
504 }
505 else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
506 {
507 uint32_t i = 0;
508 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
509 {
510 tempStr.insert(pos, std::string("%f "));
511 pos += 3;
512 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
513 }
514 printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
515 }
516 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
517 {
518 uint32_t i = 0;
519 for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
520 {
521 tempStr.insert(pos, std::string("%d "));
522 pos += 3;
523 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
524 }
525 printCallArgs.push_back(VEXTRACT(pArg, C(i)));
526 }
527 }
528 else
529 {
530 if (toupper(tempStr[pos + 1]) == 'X')
531 {
532 tempStr[pos] = '0';
533 tempStr.insert(pos + 1, "x%08");
534 printCallArgs.push_back(pArg);
535 pos += 3;
536 }
537 // for %f we need to cast float Values to doubles so that they print out correctly
538 else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
539 {
540 printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
541 pos++;
542 }
543 else
544 {
545 printCallArgs.push_back(pArg);
546 }
547 }
548
549 // advance to the next argument
550 v++;
551 pos = tempStr.find('%', ++pos);
552 }
553
554 // create global variable constant string
555 Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
556 GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
557 JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
558
559 // get a pointer to the first character in the constant string array
560 std::vector<Constant*> geplist{C(0),C(0)};
561 Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
562
563 // insert the pointer to the format string in the argument vector
564 printCallArgs[0] = strGEP;
565
566 // get pointer to CallPrint function and insert decl into the module if needed
567 std::vector<Type*> args;
568 args.push_back(PointerType::get(mInt8Ty,0));
569 FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
570 Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
571
572 // if we haven't yet added the symbol to the symbol table
573 if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
574 {
575 sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
576 }
577
578 // insert a call to CallPrint
579 return CALLA(callPrintFn,printCallArgs);
580 }
581
582 //////////////////////////////////////////////////////////////////////////
583 /// @brief Wrapper around PRINT with initializer list.
584 CallInst* Builder::PRINT(const std::string &printStr)
585 {
586 return PRINT(printStr, {});
587 }
588
589 //////////////////////////////////////////////////////////////////////////
590 /// @brief Generate a masked gather operation in LLVM IR. If not
591 /// supported on the underlying platform, emulate it with loads
592 /// @param vSrc - SIMD wide value used for lanes whose mask bit is not set
593 /// @param pBase - Int8* base VB address pointer value
594 /// @param vIndices - SIMD wide value of VB byte offsets
595 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
596 /// @param scale - value to scale indices by
597 Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
598 {
599 Value *vGather;
600
601 // use avx2 gather instruction if available
602 if(JM()->mArch.AVX2())
603 {
604 // force mask to <N x float>, required by vgather
605 Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);
606
607 vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale));
608 }
609 else
610 {
611 Value* pStack = STACKSAVE();
612
613 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
614 Value* vSrcPtr = ALLOCA(vSrc->getType());
615 STORE(vSrc, vSrcPtr);
616
617 vGather = VUNDEF_F();
618 Value *vScaleVec = VIMMED1((uint32_t)scale);
619 Value *vOffsets = MUL(vIndices,vScaleVec);
620 for(uint32_t i = 0; i < mVWidth; ++i)
621 {
622 // single component byte index
623 Value *offset = VEXTRACT(vOffsets,C(i));
624 // byte pointer to component
625 Value *loadAddress = GEP(pBase,offset);
626 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
627 // pointer to the value to load if we're masking off a component
628 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
629 Value *selMask = VEXTRACT(vMask,C(i));
630 // switch in a safe address to load if this lane is masked off
631 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
632 Value *val = LOAD(validAddress);
633 vGather = VINSERT(vGather,val,C(i));
634 }
635 STACKRESTORE(pStack);
636 }
637
638 return vGather;
639 }
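// Usage sketch (hypothetical names): gather one float per lane from a vertex
// buffer, substituting 0.0f for masked-off lanes:
//   Value* vX = GATHERPS(VIMMED1(0.0f), pVertexBase, vByteOffsets, vFetchMask, 1);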
640
641 #if USE_SIMD16_BUILDER
642 Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
643 {
644 Value *vGather = VUNDEF2_F();
645
646 // use avx512 gather instruction if available
647 if (JM()->mArch.AVX512F())
648 {
649 // force mask to a 16-bit integer, required by the SIMD16 gather
650 Value *mask = BITCAST(vMask, mInt16Ty);
651
652 vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
653 }
654 else
655 {
656 Value *src0 = EXTRACT2_F(vSrc, 0);
657 Value *src1 = EXTRACT2_F(vSrc, 1);
658
659 Value *indices0 = EXTRACT2_I(vIndices, 0);
660 Value *indices1 = EXTRACT2_I(vIndices, 1);
661
662 Value *vmask16 = VMASK2(vMask);
663
664 Value *mask0 = MASK(EXTRACT2_I(vmask16, 0)); // TODO: do this better..
665 Value *mask1 = MASK(EXTRACT2_I(vmask16, 1));
666
667 Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
668 Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
669
670 vGather = INSERT2_F(vGather, gather0, 0);
671 vGather = INSERT2_F(vGather, gather1, 1);
672 }
673
674 return vGather;
675 }
676
677 #endif
678 //////////////////////////////////////////////////////////////////////////
679 /// @brief Generate a masked gather operation in LLVM IR. If not
680 /// supported on the underlying platform, emulate it with loads
681 /// @param vSrc - SIMD wide value used for lanes whose mask bit is not set
682 /// @param pBase - Int8* base VB address pointer value
683 /// @param vIndices - SIMD wide value of VB byte offsets
684 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
685 /// @param scale - value to scale indices by
686 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
687 {
688 Value* vGather;
689
690 // use avx2 gather instruction if available
691 if(JM()->mArch.AVX2())
692 {
693 vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
694 }
695 else
696 {
697 Value* pStack = STACKSAVE();
698
699 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
700 Value* vSrcPtr = ALLOCA(vSrc->getType());
701 STORE(vSrc, vSrcPtr);
702
703 vGather = VUNDEF_I();
704 Value *vScaleVec = VIMMED1((uint32_t)scale);
705 Value *vOffsets = MUL(vIndices, vScaleVec);
706 for(uint32_t i = 0; i < mVWidth; ++i)
707 {
708 // single component byte index
709 Value *offset = VEXTRACT(vOffsets, C(i));
710 // byte pointer to component
711 Value *loadAddress = GEP(pBase, offset);
712 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
713 // pointer to the value to load if we're masking off a component
714 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
715 Value *selMask = VEXTRACT(vMask, C(i));
716 // switch in a safe address to load if this lane is masked off
717 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
718 Value *val = LOAD(validAddress, C(0));
719 vGather = VINSERT(vGather, val, C(i));
720 }
721
722 STACKRESTORE(pStack);
723 }
724 return vGather;
725 }
726
727 //////////////////////////////////////////////////////////////////////////
728 /// @brief Generate a masked gather operation in LLVM IR. If not
729 /// supported on the underlying platform, emulate it with loads
730 /// @param vSrc - SIMD wide value used for lanes whose mask bit is not set
731 /// @param pBase - Int8* base VB address pointer value
732 /// @param vIndices - SIMD wide value of VB byte offsets
733 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
734 /// @param scale - value to scale indices by
735 Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
736 {
737 Value* vGather;
738
739 // use avx2 gather instruction if available
740 if(JM()->mArch.AVX2())
741 {
742 vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth/2)), VectorType::get(mDoubleTy, mVWidth/2));
743 vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
744 }
745 else
746 {
747 Value* pStack = STACKSAVE();
748
749 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
750 Value* vSrcPtr = ALLOCA(vSrc->getType());
751 STORE(vSrc, vSrcPtr);
752
753 vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
754 Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
755 Value *vOffsets = MUL(vIndices,vScaleVec);
756 for(uint32_t i = 0; i < mVWidth/2; ++i)
757 {
758 // single component byte index
759 Value *offset = VEXTRACT(vOffsets,C(i));
760 // byte pointer to component
761 Value *loadAddress = GEP(pBase,offset);
762 loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
763 // pointer to the value to load if we're masking off a component
764 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
765 Value *selMask = VEXTRACT(vMask,C(i));
766 // switch in a safe address to load if this lane is masked off
767 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
768 Value *val = LOAD(validAddress);
769 vGather = VINSERT(vGather,val,C(i));
770 }
771 STACKRESTORE(pStack);
772 }
773 return vGather;
774 }
775
776 #if USE_SIMD16_BUILDER
777 Value *Builder::PSRLI(Value *a, Value *imm)
778 {
779 return VPSRLI(a, imm);
780 }
781
782 Value *Builder::PSRLI_16(Value *a, Value *imm)
783 {
784 Value *result = VUNDEF2_I();
785
786 // use avx512 shift right instruction if available
787 if (JM()->mArch.AVX512F())
788 {
789 result = VPSRLI_16(a, imm);
790 }
791 else
792 {
793 Value *a0 = EXTRACT2_I(a, 0);
794 Value *a1 = EXTRACT2_I(a, 1);
795
796 Value *result0 = PSRLI(a0, imm);
797 Value *result1 = PSRLI(a1, imm);
798
799 result = INSERT2_I(result, result0, 0);
800 result = INSERT2_I(result, result1, 1);
801 }
802
803 return result;
804 }
805
806 #endif
807 #if USE_SIMD16_BUILDER
808 //////////////////////////////////////////////////////////////////////////
809 /// @brief Extract the low (imm == 0) or high (imm != 0) SIMD half of a SIMD16 vector
810 Value *Builder::EXTRACT2_F(Value *a2, uint32_t imm)
811 {
812 const uint32_t i0 = (imm > 0) ? mVWidth : 0;
813
814 Value *result = VUNDEF_F();
815
816 for (uint32_t i = 0; i < mVWidth; i += 1)
817 {
818 #if 1
819 if (!a2->getType()->getScalarType()->isFloatTy())
820 {
821 a2 = BITCAST(a2, mSimd2FP32Ty);
822 }
823
824 #endif
825 Value *temp = VEXTRACT(a2, C(i0 + i));
826
827 result = VINSERT(result, temp, C(i));
828 }
829
830 return result;
831 }
832
833 Value *Builder::EXTRACT2_I(Value *a2, uint32_t imm)
834 {
835 return BITCAST(EXTRACT2_F(a2, imm), mSimdInt32Ty);
836 }
837
838 //////////////////////////////////////////////////////////////////////////
839 /// @brief Insert a SIMD vector into the low (imm == 0) or high (imm != 0) half of a SIMD16 vector
840 Value *Builder::INSERT2_F(Value *a2, Value *b, uint32_t imm)
841 {
842 const uint32_t i0 = (imm > 0) ? mVWidth : 0;
843
844 Value *result = BITCAST(a2, mSimd2FP32Ty);
845
846 for (uint32_t i = 0; i < mVWidth; i += 1)
847 {
848 #if 1
849 if (!b->getType()->getScalarType()->isFloatTy())
850 {
851 b = BITCAST(b, mSimdFP32Ty);
852 }
853
854 #endif
855 Value *temp = VEXTRACT(b, C(i));
856
857 result = VINSERT(result, temp, C(i0 + i));
858 }
859
860 return result;
861 }
862
863 Value *Builder::INSERT2_I(Value *a2, Value *b, uint32_t imm)
864 {
865 return BITCAST(INSERT2_F(a2, b, imm), mSimd2Int32Ty);
866 }
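// Example: with a SIMD8 native width, EXTRACT2_F(v16, 1) returns lanes 8..15 of
// a SIMD16 vector as a SIMD8 vector, and INSERT2_F(v16, v8, 0) writes v8 into
// lanes 0..7.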
867
868 #endif
869 //////////////////////////////////////////////////////////////////////////
870 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
871 Value *Builder::MASK(Value *vmask)
872 {
873 Value *src = BITCAST(vmask, mSimdInt32Ty);
874 return ICMP_SLT(src, VIMMED1(0));
875 }
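// Example: an x86 mask lane with its sign bit set (e.g. 0xFFFFFFFF) compares
// signed-less-than 0 and becomes i1 true; VMASK() sign-extends i1 true back to
// 0xFFFFFFFF.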
876
877 #if USE_SIMD16_BUILDER
878 Value *Builder::MASK2(Value *vmask)
879 {
880 Value *src = BITCAST(vmask, mSimd2Int32Ty);
881 return ICMP_SLT(src, VIMMED2_1(0));
882 }
883
884 #endif
885 //////////////////////////////////////////////////////////////////////////
886 /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
887 Value *Builder::VMASK(Value *mask)
888 {
889 return S_EXT(mask, mSimdInt32Ty);
890 }
891
892 #if USE_SIMD16_BUILDER
893 Value *Builder::VMASK2(Value *mask)
894 {
895 return S_EXT(mask, mSimd2Int32Ty);
896 }
897
898 #endif
899 //////////////////////////////////////////////////////////////////////////
900 /// @brief Generate a VPSHUFB operation in LLVM IR. If not
901 /// supported on the underlying platform, emulate it
902 /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
903 /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
904 /// Byte masks in the lower 128-bit lane of b select 8-bit values from the
905 /// lower 128 bits of a, and likewise for the upper lane. If a mask
906 /// value is negative, '0' is inserted.
907 Value *Builder::PSHUFB(Value* a, Value* b)
908 {
909 Value* res;
910 // use avx2 pshufb instruction if available
911 if(JM()->mArch.AVX2())
912 {
913 res = VPSHUFB(a, b);
914 }
915 else
916 {
917 Constant* cB = dyn_cast<Constant>(b);
918 // number of 8 bit elements in b
919 uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
920 // output vector
921 Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
922
923 // insert an 8 bit value from the high and low lanes of a per loop iteration
924 numElms /= 2;
925 for(uint32_t i = 0; i < numElms; i++)
926 {
927 ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
928 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
929
930 // extract values from constant mask
931 char valLow128bLane = (char)(cLow128b->getSExtValue());
932 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
933
934 Value* insertValLow128b;
935 Value* insertValHigh128b;
936
937 // if the mask value is negative, insert a '0' in the respective output position
938 // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
939 insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
940 insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
941
942 vShuf = VINSERT(vShuf, insertValLow128b, i);
943 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
944 }
945 res = vShuf;
946 }
947 return res;
948 }
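// Example: if byte 0 of b is 5, byte 0 of the result is a[5]; if byte 0 of b is
// negative (high bit set), byte 0 of the result is 0. A mask byte of 5 in the
// upper lane (b[16..31]) selects from the upper 16 bytes of a instead.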
949
950 //////////////////////////////////////////////////////////////////////////
951 /// @brief Generate a VPMOVSXBD operation (sign extend 8 8-bit values to 32
952 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
953 /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
954 /// lower 8 values are used.
955 Value *Builder::PMOVSXBD(Value* a)
956 {
957 // VPMOVSXBD output type
958 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
959 // Extract 8 values from 128bit lane and sign extend
960 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
961 }
962
963 //////////////////////////////////////////////////////////////////////////
964 /// @brief Generate a VPMOVSXWD operation (sign extend 8 16-bit values to 32
965 /// bits) in LLVM IR. If not supported on the underlying platform, emulate it
966 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
967 Value *Builder::PMOVSXWD(Value* a)
968 {
969 // VPMOVSXWD output type
970 Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
971 // Extract 8 values from 128bit lane and sign extend
972 return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
973 }
974
975 //////////////////////////////////////////////////////////////////////////
976 /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
977 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
978 /// platform, emulate it
979 /// @param a - 256bit SIMD lane(8x32bit) of integer values.
980 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
981 Value *Builder::PERMD(Value* a, Value* idx)
982 {
983 Value* res;
984 // use avx2 permute instruction if available
985 if(JM()->mArch.AVX2())
986 {
987 res = VPERMD(a, idx);
988 }
989 else
990 {
991 if (isa<Constant>(idx))
992 {
993 res = VSHUFFLE(a, a, idx);
994 }
995 else
996 {
997 res = VUNDEF_I();
998 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
999 {
1000 Value* pIndex = VEXTRACT(idx, C(l));
1001 Value* pVal = VEXTRACT(a, pIndex);
1002 res = VINSERT(res, pVal, C(l));
1003 }
1004 }
1005 }
1006 return res;
1007 }
1008
1009 //////////////////////////////////////////////////////////////////////////
1010 /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
1011 /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
1012 /// platform, emulate it
1013 /// @param a - 256bit SIMD lane(8x32bit) of float values.
1014 /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
1015 Value *Builder::PERMPS(Value* a, Value* idx)
1016 {
1017 Value* res;
1018 // use avx2 permute instruction if available
1019 if (JM()->mArch.AVX2())
1020 {
1021 // llvm 3.6.0 swapped the order of the args to vpermps
1022 res = VPERMPS(idx, a);
1023 }
1024 else
1025 {
1026 if (isa<Constant>(idx))
1027 {
1028 res = VSHUFFLE(a, a, idx);
1029 }
1030 else
1031 {
1032 res = VUNDEF_F();
1033 for (uint32_t l = 0; l < JM()->mVWidth; ++l)
1034 {
1035 Value* pIndex = VEXTRACT(idx, C(l));
1036 Value* pVal = VEXTRACT(a, pIndex);
1037 res = VINSERT(res, pVal, C(l));
1038 }
1039 }
1040 }
1041
1042 return res;
1043 }
1044
1045 //////////////////////////////////////////////////////////////////////////
1046 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
1047 /// in LLVM IR. If not supported on the underlying platform, emulate it
1048 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
1049 Value *Builder::CVTPH2PS(Value* a)
1050 {
1051 if (JM()->mArch.F16C())
1052 {
1053 return VCVTPH2PS(a);
1054 }
1055 else
1056 {
1057 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
1058 Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy));
1059
1060 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == nullptr)
1061 {
1062 sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", (void *)&ConvertFloat16ToFloat32);
1063 }
1064
1065 Value* pResult = UndefValue::get(mSimdFP32Ty);
1066 for (uint32_t i = 0; i < mVWidth; ++i)
1067 {
1068 Value* pSrc = VEXTRACT(a, C(i));
1069 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
1070 pResult = VINSERT(pResult, pConv, C(i));
1071 }
1072
1073 return pResult;
1074 }
1075 }
1076
1077 //////////////////////////////////////////////////////////////////////////
1078 /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
1079 /// in LLVM IR. If not supported on the underlying platform, emulate it
1080 /// @param a - 256bit SIMD (8x32bit) of float32 values
1081 Value *Builder::CVTPS2PH(Value* a, Value* rounding)
1082 {
1083 if (JM()->mArch.F16C())
1084 {
1085 return VCVTPS2PH(a, rounding);
1086 }
1087 else
1088 {
1089 // call scalar C function for now
1090 FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
1091 Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy));
1092
1093 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr)
1094 {
1095 sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", (void *)&ConvertFloat32ToFloat16);
1096 }
1097
1098 Value* pResult = UndefValue::get(mSimdInt16Ty);
1099 for (uint32_t i = 0; i < mVWidth; ++i)
1100 {
1101 Value* pSrc = VEXTRACT(a, C(i));
1102 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
1103 pResult = VINSERT(pResult, pConv, C(i));
1104 }
1105
1106 return pResult;
1107 }
1108 }
1109
1110 Value *Builder::PMAXSD(Value* a, Value* b)
1111 {
1112 Value* cmp = ICMP_SGT(a, b);
1113 return SELECT(cmp, a, b);
1114 }
1115
1116 Value *Builder::PMINSD(Value* a, Value* b)
1117 {
1118 Value* cmp = ICMP_SLT(a, b);
1119 return SELECT(cmp, a, b);
1120 }
1121
1122 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
1123 Value* mask, Value* vGatherComponents[], bool bPackedOutput)
1124 {
1125 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
1126 if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
1127 {
1128 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1129 }
1130 else
1131 {
1132 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
1133 }
1134 }
1135
1136 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1137 Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
1138 {
1139 switch(info.bpp / info.numComps)
1140 {
1141 case 16:
1142 {
1143 Value* vGatherResult[2];
1144
1145 // TODO: vGatherMaskedVal
1146 Value* vGatherMaskedVal = VIMMED1((float)0);
1147
1148 // always have at least one component out of x or y to fetch
1149
1150 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1151 // e.g. result of first 8x32bit integer gather for 16bit components
1152 // 256i - 0 1 2 3 4 5 6 7
1153 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1154 //
1155
1156 // if we have at least one component out of z or w to fetch
1157 if(info.numComps > 2)
1158 {
1159 // offset base to the next components(zw) in the vertex to gather
1160 pSrcBase = GEP(pSrcBase, C((char)4));
1161
1162 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1163 // e.g. result of second 8x32bit integer gather for 16bit components
1164 // 256i - 0 1 2 3 4 5 6 7
1165 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1166 //
1167 }
1168 else
1169 {
1170 vGatherResult[1] = vGatherMaskedVal;
1171 }
1172
1173 // Shuffle gathered components into place, each row is a component
1174 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1175 }
1176 break;
1177 case 32:
1178 {
1179 // apply defaults
1180 for (uint32_t i = 0; i < 4; ++i)
1181 {
1182 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
1183 }
1184
1185 for(uint32_t i = 0; i < info.numComps; i++)
1186 {
1187 uint32_t swizzleIndex = info.swizzle[i];
1188
1189 // Gather a SIMD of components
1190 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
1191
1192 // offset base to the next component to gather
1193 pSrcBase = GEP(pSrcBase, C((char)4));
1194 }
1195 }
1196 break;
1197 default:
1198 SWR_INVALID("Invalid float format");
1199 break;
1200 }
1201 }
1202
1203 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
1204 Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
1205 {
1206 switch (info.bpp / info.numComps)
1207 {
1208 case 8:
1209 {
1210 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1211 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1212 // e.g. result of an 8x32bit integer gather for 8bit components
1213 // 256i - 0 1 2 3 4 5 6 7
1214 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
1215
1216 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1217 }
1218 break;
1219 case 16:
1220 {
1221 Value* vGatherResult[2];
1222
1223 // TODO: vGatherMaskedVal
1224 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
1225
1226 // always have at least one component out of x or y to fetch
1227
1228 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1229 // e.g. result of first 8x32bit integer gather for 16bit components
1230 // 256i - 0 1 2 3 4 5 6 7
1231 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
1232 //
1233
1234 // if we have at least one component out of z or w to fetch
1235 if(info.numComps > 2)
1236 {
1237 // offset base to the next components(zw) in the vertex to gather
1238 pSrcBase = GEP(pSrcBase, C((char)4));
1239
1240 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
1241 // e.g. result of second 8x32bit integer gather for 16bit components
1242 // 256i - 0 1 2 3 4 5 6 7
1243 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1244 //
1245 }
1246 else
1247 {
1248 vGatherResult[1] = vGatherMaskedVal;
1249 }
1250
1251 // Shuffle gathered components into place, each row is a component
1252 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
1253
1254 }
1255 break;
1256 case 32:
1257 {
1258 // apply defaults
1259 for (uint32_t i = 0; i < 4; ++i)
1260 {
1261 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
1262 }
1263
1264 for(uint32_t i = 0; i < info.numComps; i++)
1265 {
1266 uint32_t swizzleIndex = info.swizzle[i];
1267
1268 // Gather a SIMD of components
1269 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
1270
1271 // offset base to the next component to gather
1272 pSrcBase = GEP(pSrcBase, C((char)4));
1273 }
1274 }
1275 break;
1276 default:
1277 SWR_INVALID("unsupported format");
1278 break;
1279 }
1280 }
1281
1282 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
1283 {
1284 // cast types
1285 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1286 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1287
1288 // input could either be float or int vector; do shuffle work in int
1289 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
1290 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
1291
1292 if(bPackedOutput)
1293 {
1294 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1295
1296 // shuffle mask
1297 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1298 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1299 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
1300 // after pshufb: group components together in each 128bit lane
1301 // 256i - 0 1 2 3 4 5 6 7
1302 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1303
1304 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1305 // after PERMD: move and pack xy components into each 128bit lane
1306 // 256i - 0 1 2 3 4 5 6 7
1307 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1308
1309 // do the same for zw components
1310 Value* vi128ZW = nullptr;
1311 if(info.numComps > 2)
1312 {
1313 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
1314 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1315 }
1316
1317 for(uint32_t i = 0; i < 4; i++)
1318 {
1319 uint32_t swizzleIndex = info.swizzle[i];
1320 // todo: fix for packed
1321 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1322 if(i >= info.numComps)
1323 {
1324 // set the default component val
1325 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1326 continue;
1327 }
1328
1329 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1330 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1331 // if x or y, use vi128XY permute result, else use vi128ZW
1332 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1333
1334 // extract packed component 128 bit lanes
1335 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1336 }
1337
1338 }
1339 else
1340 {
1341 // pshufb masks for each component
1342 Value* vConstMask[2];
1343 // x/z shuffle mask
1344 vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1345 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
1346
1347 // y/w shuffle mask
1348 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1349 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1350
1351
1352 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1353 // apply defaults
1354 for (uint32_t i = 0; i < 4; ++i)
1355 {
1356 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1357 }
1358
1359 for(uint32_t i = 0; i < info.numComps; i++)
1360 {
1361 uint32_t swizzleIndex = info.swizzle[i];
1362
1363 // select correct constMask for x/z or y/w pshufb
1364 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1365 // if x or y, use the first gather result, else use the second
1366 uint32_t selectedGather = (i < 2) ? 0 : 1;
1367
1368 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
1369 // after pshufb mask for x channel; z uses the same shuffle from the second gather
1370 // 256i - 0 1 2 3 4 5 6 7
1371 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1372 }
1373 }
1374 }
1375
1376 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
1377 {
1378 // cast types
1379 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1380 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
1381
1382 if(bPackedOutput)
1383 {
1384 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
1385 // shuffle mask
1386 Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1387 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
1388 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1389 // after pshufb: group components together in each 128bit lane
1390 // 256i - 0 1 2 3 4 5 6 7
1391 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1392
1393 Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
1394 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1395 // 256i - 0 1 2 3 4 5 6 7
1396 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1397
1398 // do the same for zw components
1399 Value* vi128ZW = nullptr;
1400 if(info.numComps > 2)
1401 {
1402 vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
1403 }
1404
1405 // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
1406 for(uint32_t i = 0; i < 4; i++)
1407 {
1408 uint32_t swizzleIndex = info.swizzle[i];
1409 // todo: fix for packed
1410 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
1411 if(i >= info.numComps)
1412 {
1413 // set the default component val
1414 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
1415 continue;
1416 }
1417
1418 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1419 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1420 // if x or y, use vi128XY permute result, else use vi128ZW
1421 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1422
1423 // extract the packed component's 128-bit lane
1424 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
1425 }
1426 }
1427 // else zero extend
1428 else{
1429 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1430 // apply defaults
1431 for (uint32_t i = 0; i < 4; ++i)
1432 {
1433 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
1434 }
1435
1436 for(uint32_t i = 0; i < info.numComps; i++){
1437 uint32_t swizzleIndex = info.swizzle[i];
1438
1439 // pshufb masks for each component
1440 Value* vConstMask;
1441 switch(i)
1442 {
1443 case 0:
1444 // x shuffle mask
1445 vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1446 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1447 break;
1448 case 1:
1449 // y shuffle mask
1450 vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1451 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1452 break;
1453 case 2:
1454 // z shuffle mask
1455 vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1456 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1457 break;
1458 case 3:
1459 // w shuffle mask
1460 vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1461 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1462 break;
1463 default:
1464 vConstMask = nullptr;
1465 break;
1466 }
1467
1468 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
1469 // after pshufb for x channel
1470 // 256i - 0 1 2 3 4 5 6 7
1471 // x000 x000 x000 x000 x000 x000 x000 x000
1472 }
1473 }
1474 }
1475
1476 // Helper function to create alloca in entry block of function
1477 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
1478 {
1479 auto saveIP = IRB()->saveIP();
1480 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1481 pFunc->getEntryBlock().begin());
1482 Value* pAlloca = ALLOCA(pType);
1483 if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1484 return pAlloca;
1485 }
1486
1487 Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
1488 {
1489 auto saveIP = IRB()->saveIP();
1490 IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
1491 pFunc->getEntryBlock().begin());
1492 Value* pAlloca = ALLOCA(pType, pArraySize);
1493 if (saveIP.isSet()) IRB()->restoreIP(saveIP);
1494 return pAlloca;
1495 }
1496
1497 //////////////////////////////////////////////////////////////////////////
1498 /// @brief emulates a scatter operation.
1499 /// @param pDst - pointer to destination
1500 /// @param vSrc - vector of src data to scatter
1501 /// @param vOffsets - vector of byte offsets from pDst
1502 /// @param vMask - mask of valid lanes
1503 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
1504 {
1505 /* Scatter algorithm
1506
1507 while(Index = BitScanForward(mask))
1508 srcElem = srcVector[Index]
1509 offsetElem = offsetVector[Index]
1510 *(pDst + offsetElem) = srcElem
1511 Update mask: mask &= ~(1 << Index)
1512
1513 */
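// Example trace (hypothetical mask): with vMask lanes {T,F,T,F,F,F,F,F},
// VMOVMSKPS yields pMask = 0b00000101; the loop stores lanes 0 and 2, then cttz
// of the cleared mask returns 32 and control exits to pPostLoop.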
1514
1515 BasicBlock* pCurBB = IRB()->GetInsertBlock();
1516 Function* pFunc = pCurBB->getParent();
1517 Type* pSrcTy = vSrc->getType()->getVectorElementType();
1518
1519 // Store vectors on stack
1520 if (pScatterStackSrc == nullptr)
1521 {
1522 // Save off stack allocations and reuse per scatter. Significantly reduces stack
1523 // requirements for shaders with a lot of scatters.
1524 pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
1525 pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
1526 }
1527
1528 Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
1529 Value* pOffsetsArrayPtr = pScatterStackOffsets;
1530 STORE(vSrc, pSrcArrayPtr);
1531 STORE(vOffsets, pOffsetsArrayPtr);
1532
1533 // Cast to pointers for random access
1534 pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
1535 pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
1536
1537 Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
1538
1539 // Get cttz function
1540 Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
1541
1542 // Setup loop basic block
1543 BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc);
1544
1545 // compute first set bit
1546 Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
1547
1548 Value* pIsUndef = ICMP_EQ(pIndex, C(32));
1549
1550 // Split current block
1551 BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
1552
1553 // Remove unconditional jump created by splitBasicBlock
1554 pCurBB->getTerminator()->eraseFromParent();
1555
1556 // Add terminator to end of original block
1557 IRB()->SetInsertPoint(pCurBB);
1558
1559 // Add conditional branch
1560 COND_BR(pIsUndef, pPostLoop, pLoop);
1561
1562 // Add loop basic block contents
1563 IRB()->SetInsertPoint(pLoop);
1564 PHINode* pIndexPhi = PHI(mInt32Ty, 2);
1565 PHINode* pMaskPhi = PHI(mInt32Ty, 2);
1566
1567 pIndexPhi->addIncoming(pIndex, pCurBB);
1568 pMaskPhi->addIncoming(pMask, pCurBB);
1569
1570 // Extract elements for this index
1571 Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
1572 Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
1573
1574 // GEP to this offset in dst
1575 Value* pCurDst = GEP(pDst, pOffsetElem);
1576 pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
1577 STORE(pSrcElem, pCurDst);
1578
1579 // Update the mask
1580 Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
1581
1582 // Terminator
1583 Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
1584
1585 pIsUndef = ICMP_EQ(pNewIndex, C(32));
1586 COND_BR(pIsUndef, pPostLoop, pLoop);
1587
1588 // Update phi edges
1589 pIndexPhi->addIncoming(pNewIndex, pLoop);
1590 pMaskPhi->addIncoming(pNewMask, pLoop);
1591
1592 // Move builder to beginning of post loop
1593 IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
1594 }
1595
1596 Value* Builder::VABSPS(Value* a)
1597 {
1598 Value* asInt = BITCAST(a, mSimdInt32Ty);
1599 Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
1600 return result;
1601 }
1602
1603 Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
1604 {
1605 Value *lowCmp = ICMP_SLT(src, low);
1606 Value *ret = SELECT(lowCmp, low, src);
1607
1608 Value *highCmp = ICMP_SGT(ret, high);
1609 ret = SELECT(highCmp, high, ret);
1610
1611 return ret;
1612 }
1613
1614 Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
1615 {
1616 Value *lowCmp = FCMP_OLT(src, low);
1617 Value *ret = SELECT(lowCmp, low, src);
1618
1619 Value *highCmp = FCMP_OGT(ret, high);
1620 ret = SELECT(highCmp, high, ret);
1621
1622 return ret;
1623 }
1624
1625 Value *Builder::FCLAMP(Value* src, float low, float high)
1626 {
1627 Value* result = VMAXPS(src, VIMMED1(low));
1628 result = VMINPS(result, VIMMED1(high));
1629
1630 return result;
1631 }
1632
1633 //////////////////////////////////////////////////////////////////////////
1634 /// @brief save/restore stack, providing ability to push/pop the stack and
1635 /// reduce overall stack requirements for temporary stack use
1636 Value* Builder::STACKSAVE()
1637 {
1638 Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
1639 return CALLA(pfnStackSave);
1640 }
1641
1642 void Builder::STACKRESTORE(Value* pSaved)
1643 {
1644 Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
1645 CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
1646 }
1647
1648 Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
1649 {
1650 Value* vOut;
1651 // use FMADs if available
1652 if(JM()->mArch.AVX2())
1653 {
1654 vOut = VFMADDPS(a, b, c);
1655 }
1656 else
1657 {
1658 vOut = FADD(FMUL(a, b), c);
1659 }
1660 return vOut;
1661 }
1662
1663 Value* Builder::POPCNT(Value* a)
1664 {
1665 Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
1666 return CALL(pCtPop, std::initializer_list<Value*>{a});
1667 }
1668
1669 //////////////////////////////////////////////////////////////////////////
1670 /// @brief C functions called by LLVM IR
1671 //////////////////////////////////////////////////////////////////////////
1672
1673 //////////////////////////////////////////////////////////////////////////
1674 /// @brief called in JIT code, inserted by PRINT
1675 /// output to both stdout and visual studio debug console
1676 void __cdecl CallPrint(const char* fmt, ...)
1677 {
1678 va_list args;
1679 va_start(args, fmt);
1680 vprintf(fmt, args);
1681
1682 #if defined( _WIN32 )
1683 char strBuf[1024];
1684 vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
1685 OutputDebugStringA(strBuf);
1686 #endif
1687
1688 va_end(args);
1689 }
1690
1691 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
1692 {
1693 bool flag = !imm8->isZeroValue();
1694 SmallVector<Constant*,8> idx;
1695 for (unsigned i = 0; i < mVWidth / 2; i++) {
1696 idx.push_back(C(flag ? i + mVWidth / 2 : i));
1697 }
1698 return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
1699 }
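// Example: with an 8-wide SIMD, VEXTRACTI128(a, C(1)) builds shuffle indices
// {4, 5, 6, 7} and returns the upper four elements (the upper 128 bits) of a.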
1700
1701 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
1702 {
1703 bool flag = !imm8->isZeroValue();
1704 SmallVector<Constant*,8> idx;
1705 for (unsigned i = 0; i < mVWidth; i++) {
1706 idx.push_back(C(i));
1707 }
1708 Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
1709
1710 SmallVector<Constant*,8> idx2;
1711 for (unsigned i = 0; i < mVWidth / 2; i++) {
1712 idx2.push_back(C(flag ? i : i + mVWidth));
1713 }
1714 for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
1715 idx2.push_back(C(flag ? i + mVWidth / 2 : i));
1716 }
1717 return VSHUFFLE(a, inter, ConstantVector::get(idx2));
1718 }
1719
1720 // rdtsc buckets macros
1721 void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
1722 {
1723 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1724 // buckets framework when single threaded
1725 if (KNOB_SINGLE_THREADED)
1726 {
1727 std::vector<Type*> args{
1728 PointerType::get(mInt32Ty, 0), // pBucketMgr
1729 mInt32Ty // id
1730 };
1731
1732 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1733 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
1734 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
1735 {
1736 sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
1737 }
1738
1739 CALL(pFunc, { pBucketMgr, pId });
1740 }
1741 }
1742
1743 void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
1744 {
1745 // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
1746 // buckets framework when single threaded
1747 if (KNOB_SINGLE_THREADED)
1748 {
1749 std::vector<Type*> args{
1750 PointerType::get(mInt32Ty, 0), // pBucketMgr
1751 mInt32Ty // id
1752 };
1753
1754 FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
1755 Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
1756 if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
1757 {
1758 sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
1759 }
1760
1761 CALL(pFunc, { pBucketMgr, pId });
1762 }
1763 }
1764
1765
1766 uint32_t Builder::GetTypeSize(Type* pType)
1767 {
1768 if (pType->isStructTy())
1769 {
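// NOTE: assumes a homogeneous struct; the size of element 0 is used for every member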
1770 uint32_t numElems = pType->getStructNumElements();
1771 Type* pElemTy = pType->getStructElementType(0);
1772 return numElems * GetTypeSize(pElemTy);
1773 }
1774
1775 if (pType->isArrayTy())
1776 {
1777 uint32_t numElems = pType->getArrayNumElements();
1778 Type* pElemTy = pType->getArrayElementType();
1779 return numElems * GetTypeSize(pElemTy);
1780 }
1781
1782 if (pType->isIntegerTy())
1783 {
1784 uint32_t bitSize = pType->getIntegerBitWidth();
1785 return bitSize / 8;
1786 }
1787
1788 if (pType->isFloatTy())
1789 {
1790 return 4;
1791 }
1792
1793 if (pType->isHalfTy())
1794 {
1795 return 2;
1796 }
1797
1798 if (pType->isDoubleTy())
1799 {
1800 return 8;
1801 }
1802
1803 SWR_ASSERT(false, "Unimplemented type.");
1804 return 0;
1805 }
1806 }