swr/rast: Don't include private context in gather args
[mesa.git] src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder_mem.cpp
*
* @brief Implementation for the memory (load/store/gather/scatter) builder functions
*
* Notes:
*
******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"
#include "common/rdtsc_buckets.h"

#include <cstdarg>


namespace SwrJit
{

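    //////////////////////////////////////////////////////////////////////////
    /// @brief Convenience wrappers that build a GEP index vector from an
    ///        initializer list (of Value* or constant uint32_t indices) and
    ///        forward to GEPA.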
    Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return GEPA(ptr, indices);
    }

    Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return GEPA(ptr, indices);
    }

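    //////////////////////////////////////////////////////////////////////////
    /// @brief Same as GEP above, but emits an inbounds GEP.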
    Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return IN_BOUNDS_GEP(ptr, indices);
    }

    Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return IN_BOUNDS_GEP(ptr, indices);
    }

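    //////////////////////////////////////////////////////////////////////////
    /// @brief LOAD/STORE helpers that first compute the address with GEPA
    ///        from an index list, then issue the load or store at that address.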
    LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return STORE(val, GEPA(basePtr, valIndices));
    }

    StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return STORE(val, GEPA(basePtr, valIndices));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate an i32 masked load operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with a float masked load
    /// @param src - base address pointer for the load
    /// @param mask - SIMD wide mask that controls whether to access memory or load 0
    Value *Builder::MASKLOADD(Value* src, Value* mask)
    {
        Value* vResult;
        // use avx2 masked load instruction if available
        if (JM()->mArch.AVX2())
        {
            Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
            vResult = CALL(func, { src, mask });
        }
        else
        {
            // maskload intrinsic expects integer mask operand in llvm >= 3.8
#if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
            mask = BITCAST(mask, VectorType::get(mInt32Ty, mVWidth));
#else
            mask = BITCAST(mask, VectorType::get(mFP32Ty, mVWidth));
#endif
            Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskload_ps_256);
            vResult = BITCAST(CALL(func, { src, mask }), VectorType::get(mInt32Ty, mVWidth));
        }
        return vResult;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
    {
        Value *vGather;

        // use avx2 gather instruction if available
        if (JM()->mArch.AVX2())
        {
            // force mask to <N x float>, required by vgather
            Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);

            vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale));
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = VUNDEF_F();
            Value *vScaleVec = VIMMED1((uint32_t)scale);
            Value *vOffsets = MUL(vIndices, vScaleVec);
            for (uint32_t i = 0; i < mVWidth; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets, C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase, offset);
                loadAddress = BITCAST(loadAddress, PointerType::get(mFP32Ty, 0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
                Value *selMask = VEXTRACT(vMask, C(i));
                // switch in the safe (stack) address when this lane is masked off
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress);
                vGather = VINSERT(vGather, val, C(i));
            }

            STACKRESTORE(pStack);
        }

        return vGather;
    }

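    //////////////////////////////////////////////////////////////////////////
    /// @brief 16-wide version of GATHERPS.  Uses the AVX512F gather when
    ///        available, otherwise splits the operands in half and issues
    ///        two GATHERPS calls.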
    Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
    {
        Value *vGather = VUNDEF_F_16();

        // use AVX512F gather instruction if available
        if (JM()->mArch.AVX512F())
        {
            // force mask to <N-bit integer>, required by vgather
            Value *mask = BITCAST(vMask, mInt16Ty);

            vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
        }
        else
        {
            Value *src0 = EXTRACT_16(vSrc, 0);
            Value *src1 = EXTRACT_16(vSrc, 1);

            Value *indices0 = EXTRACT_16(vIndices, 0);
            Value *indices1 = EXTRACT_16(vIndices, 1);

            Value *mask0 = EXTRACT_16(vMask, 0);
            Value *mask1 = EXTRACT_16(vMask, 1);

            Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
            Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);

            vGather = JOIN_16(gather0, gather1);
        }

        return vGather;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        Value* vGather;

        // use avx2 gather instruction if available
        if (JM()->mArch.AVX2())
        {
            vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = VUNDEF_I();
            Value *vScaleVec = VIMMED1((uint32_t)scale);
            Value *vOffsets = MUL(vIndices, vScaleVec);
            for (uint32_t i = 0; i < mVWidth; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets, C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase, offset);
                loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
                Value *selMask = VEXTRACT(vMask, C(i));
                // switch in the safe (stack) address when this lane is masked off
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress, C(0));
                vGather = VINSERT(vGather, val, C(i));
            }

            STACKRESTORE(pStack);
        }

        return vGather;
    }

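    //////////////////////////////////////////////////////////////////////////
    /// @brief 16-wide version of GATHERDD.  Uses the AVX512F gather when
    ///        available, otherwise splits the operands in half and issues
    ///        two GATHERDD calls.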
    Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
    {
        Value *vGather = VUNDEF_I_16();

        // use AVX512F gather instruction if available
        if (JM()->mArch.AVX512F())
        {
            // force mask to <N-bit integer>, required by vgather
            Value *mask = BITCAST(vMask, mInt16Ty);

            vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
        }
        else
        {
            Value *src0 = EXTRACT_16(vSrc, 0);
            Value *src1 = EXTRACT_16(vSrc, 1);

            Value *indices0 = EXTRACT_16(vIndices, 0);
            Value *indices1 = EXTRACT_16(vIndices, 1);

            Value *mask0 = EXTRACT_16(vMask, 0);
            Value *mask1 = EXTRACT_16(vMask, 1);

            Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
            Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);

            vGather = JOIN_16(gather0, gather1);
        }

        return vGather;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        Value* vGather;

        // use avx2 gather instruction if available
        if (JM()->mArch.AVX2())
        {
            vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth / 2)), VectorType::get(mDoubleTy, mVWidth / 2));
            vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack.  this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
            Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
            Value *vOffsets = MUL(vIndices, vScaleVec);
            for (uint32_t i = 0; i < mVWidth / 2; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets, C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase, offset);
                loadAddress = BITCAST(loadAddress, PointerType::get(mDoubleTy, 0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
                Value *selMask = VEXTRACT(vMask, C(i));
                // switch in the safe (stack) address when this lane is masked off
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress);
                vGather = VINSERT(vGather, val, C(i));
            }
            STACKRESTORE(pStack);
        }
        return vGather;
    }

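    //////////////////////////////////////////////////////////////////////////
    /// @brief Gather up to 4 components per lane for the given format,
    ///        dispatching to the float (GATHER4PS) or integer (GATHER4DD) path.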
    void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
        Value* mask, Value* vGatherComponents[], bool bPackedOutput)
    {
        const SWR_FORMAT_INFO &info = GetFormatInfo(format);
        if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
        {
            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
        }
        else
        {
            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
        }
    }

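    //////////////////////////////////////////////////////////////////////////
    /// @brief Gather float components, selecting the 16bpc or 32bpc path from
    ///        the per-component bit width (info.bpp / info.numComps).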
    void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
        Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
    {
        switch (info.bpp / info.numComps)
        {
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((float)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = GEP(pSrcBase, C((char)4));

                vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);

                // offset base to the next component to gather
                pSrcBase = GEP(pSrcBase, C((char)4));
            }
        }
        break;
        default:
            SWR_INVALID("Invalid float format");
            break;
        }
    }

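    //////////////////////////////////////////////////////////////////////////
    /// @brief Gather integer components, selecting the 8bpc, 16bpc, or 32bpc
    ///        path from the per-component bit width (info.bpp / info.numComps).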
    void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
        Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
    {
        switch (info.bpp / info.numComps)
        {
        case 8:
        {
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
            Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
            // e.g. result of an 8x32bit integer gather for 8bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

            Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = GEP(pSrcBase, C((char)4));

                vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);

        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);

                // offset base to the next component to gather
                pSrcBase = GEP(pSrcBase, C((char)4));
            }
        }
        break;
        default:
            SWR_INVALID("unsupported format");
            break;
        }
    }

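    //////////////////////////////////////////////////////////////////////////
    /// @brief Shuffle the results of two 16bpc gathers (xy and zw pairs packed
    ///        in each 32bit lane) into per-component outputs, either packed
    ///        into 128bit lanes or zero-extended to one component per 32bit lane.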
    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        // input could either be float or int vector; do shuffle work in int
        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

        if (bPackedOutput)
        {
            Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                          0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
                vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
            }

            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }

        }
        else
        {
            // pshufb masks for each component
            Value* vConstMask[2];
            // x/z shuffle mask
            vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                                      0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });

            // y/w shuffle mask
            vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                      2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });


            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // select correct constMask for x/z or y/w pshufb
                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use the first gather input, else use the second (zw) gather input
                uint32_t selectedGather = (i < 2) ? 0 : 1;

                vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
                // after pshufb mask for x channel; z uses the same shuffle from the second gather
                // 256i - 0    1    2    3    4    5    6    7
                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
            }
        }
    }

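    //////////////////////////////////////////////////////////////////////////
    /// @brief Shuffle the result of an 8bpc gather (xyzw packed in each 32bit
    ///        lane) into per-component outputs, either packed into 128bit
    ///        lanes or zero-extended to one component per 32bit lane.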
    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        if (bPackedOutput)
        {
            Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
            // shuffle mask
            Value* vConstMask = C<char>({ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                          0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 });
            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

            Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
            }

            // sign extend all enabled components.  If we have a full vVertexElements, output to the current simdvertex
            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // sign extend
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        // else zero extend
        else
        {
            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // pshufb masks for each component
                Value* vConstMask;
                switch (i)
                {
                case 0:
                    // x shuffle mask
                    vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                           0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
                    break;
                case 1:
                    // y shuffle mask
                    vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                           1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
                    break;
                case 2:
                    // z shuffle mask
                    vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                           2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
                    break;
                case 3:
                    // w shuffle mask
                    vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                           3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
                    break;
                default:
                    vConstMask = nullptr;
                    break;
                }

                vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                // after pshufb for x channel
                // 256i - 0    1    2    3    4    5    6    7
                //        x000 x000 x000 x000 x000 x000 x000 x000
            }
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Emulates a scatter operation.
    /// @param pDst - pointer to destination
    /// @param vSrc - vector of src data to scatter
    /// @param vOffsets - vector of byte offsets from pDst
    /// @param vMask - mask of valid lanes
    void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
    {
        /* Scatter algorithm

           while (Index = BitScanForward(mask))
               srcElem = srcVector[Index]
               offsetElem = offsetVector[Index]
               *(pDst + offsetElem) = srcElem
               Update mask (&= ~(1 << Index))

        */

        BasicBlock* pCurBB = IRB()->GetInsertBlock();
        Function* pFunc = pCurBB->getParent();
        Type* pSrcTy = vSrc->getType()->getVectorElementType();

        // Store vectors on stack
        if (pScatterStackSrc == nullptr)
        {
            // Save off stack allocations and reuse per scatter. Significantly reduces stack
            // requirements for shaders with a lot of scatters.
            pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
            pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
        }

        Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
        Value* pOffsetsArrayPtr = pScatterStackOffsets;
        STORE(vSrc, pSrcArrayPtr);
        STORE(vOffsets, pOffsetsArrayPtr);

        // Cast to pointers for random access
        pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
        pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

        Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));

        // Get cttz function
        Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });

        // Setup loop basic block
        BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);

        // compute first set bit
        Value* pIndex = CALL(pfnCttz, { pMask, C(false) });

        Value* pIsUndef = ICMP_EQ(pIndex, C(32));

        // Split current block
        BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

        // Remove unconditional jump created by splitBasicBlock
        pCurBB->getTerminator()->eraseFromParent();

        // Add terminator to end of original block
        IRB()->SetInsertPoint(pCurBB);

        // Add conditional branch
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Add loop basic block contents
        IRB()->SetInsertPoint(pLoop);
        PHINode* pIndexPhi = PHI(mInt32Ty, 2);
        PHINode* pMaskPhi = PHI(mInt32Ty, 2);

        pIndexPhi->addIncoming(pIndex, pCurBB);
        pMaskPhi->addIncoming(pMask, pCurBB);

        // Extract elements for this index
        Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });

        // GEP to this offset in dst
        Value* pCurDst = GEP(pDst, pOffsetElem);
        pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
        STORE(pSrcElem, pCurDst);

        // Update the mask
        Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

        // Terminator
        Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });

        pIsUndef = ICMP_EQ(pNewIndex, C(32));
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Update phi edges
        pIndexPhi->addIncoming(pNewIndex, pLoop);
        pMaskPhi->addIncoming(pNewMask, pLoop);

        // Move builder to beginning of post loop
        IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief save/restore stack, providing ability to push/pop the stack and
    ///        reduce overall stack requirements for temporary stack use
    Value* Builder::STACKSAVE()
    {
        Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
        return CALLA(pfnStackSave);
    }

    void Builder::STACKRESTORE(Value* pSaved)
    {
        Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
        CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
    }

}