swr/rast: Fix GATHERPS to avoid assertions.
[mesa.git] / src / gallium / drivers / swr / rasterizer / jitter / builder_mem.cpp
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder_mem.cpp
*
* @brief Implementation for memory (load/store/gather/scatter) builder functions
*
* Notes:
*
******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"
#include "common/rdtsc_buckets.h"

#include <cstdarg>


namespace SwrJit
{

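    //////////////////////////////////////////////////////////////////////////
    /// @brief Convenience GEP wrappers: build the index vector from an
    ///        initializer list (Value* or uint32_t constants) and forward to GEPA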
    Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return GEPA(ptr, indices);
    }

    Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return GEPA(ptr, indices);
    }

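    //////////////////////////////////////////////////////////////////////////
    /// @brief Same as GEP above, but forwards to the inbounds GEP overload so
    ///        the resulting address is marked as staying within the base object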
    Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return IN_BOUNDS_GEP(ptr, indices);
    }

    Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return IN_BOUNDS_GEP(ptr, indices);
    }

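    //////////////////////////////////////////////////////////////////////////
    /// @brief Loads from the address formed by indexing basePtr with the given
    ///        constant indices (LOAD) or runtime Value* indices (LOADV)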
    LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return LOAD(GEPA(basePtr, valIndices), name);
    }

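    //////////////////////////////////////////////////////////////////////////
    /// @brief Stores val to the address formed by indexing basePtr with the
    ///        given constant indices (STORE) or runtime Value* indices (STOREV)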
    StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return STORE(val, GEPA(basePtr, valIndices));
    }

    StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return STORE(val, GEPA(basePtr, valIndices));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate an i32 masked load operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with a float masked load
    /// @param src - base address pointer for the load
    /// @param mask - SIMD wide mask that controls whether to access memory or load 0
    Value *Builder::MASKLOADD(Value* src, Value* mask)
    {
        Value* vResult;
        // use avx2 maskload instruction if available
        if (JM()->mArch.AVX2())
        {
            Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
            vResult = CALL(func, { src, mask });
        }
        else
        {
            // maskload intrinsic expects integer mask operand in llvm >= 3.8
#if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
            mask = BITCAST(mask, VectorType::get(mInt32Ty, mVWidth));
#else
            mask = BITCAST(mask, VectorType::get(mFP32Ty, mVWidth));
#endif
            Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskload_ps_256);
            vResult = BITCAST(CALL(func, { src, mask }), VectorType::get(mInt32Ty, mVWidth));
        }
        return vResult;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
    {
        Value *vGather;
        Value *pBasePtr = INT_TO_PTR(pBase, PointerType::get(mInt8Ty, 0));

        // use avx2 gather instruction if available
        if (JM()->mArch.AVX2())
        {
            // force mask to <N x float>, required by vgather
            Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);

            vGather = VGATHERPS(vSrc, pBasePtr, vIndices, mask, C(scale));
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = VUNDEF_F();
            Value *vScaleVec = VIMMED1((uint32_t)scale);
            Value *vOffsets = MUL(vIndices, vScaleVec);
            for (uint32_t i = 0; i < mVWidth; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets, C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBasePtr, offset);
                loadAddress = BITCAST(loadAddress, PointerType::get(mFP32Ty, 0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
                Value *selMask = VEXTRACT(vMask, C(i));
                // switch in a safe address to load if this lane is masked off
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress);
                vGather = VINSERT(vGather, val, C(i));
            }

            STACKRESTORE(pStack);
        }

        return vGather;
    }

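    //////////////////////////////////////////////////////////////////////////
    /// @brief 16-wide float gather; uses the AVX512F gather when available,
    ///        otherwise splits into two 8-wide GATHERPS operations and joins
    ///        the results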
    Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
    {
        Value *vGather = VUNDEF_F_16();

        // use AVX512F gather instruction if available
        if (JM()->mArch.AVX512F())
        {
            // force mask to <N-bit Integer>, required by vgather2
            Value *mask = BITCAST(vMask, mInt16Ty);

            vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
        }
        else
        {
            Value *src0 = EXTRACT_16(vSrc, 0);
            Value *src1 = EXTRACT_16(vSrc, 1);

            Value *indices0 = EXTRACT_16(vIndices, 0);
            Value *indices1 = EXTRACT_16(vIndices, 1);

            Value *mask0 = EXTRACT_16(vMask, 0);
            Value *mask1 = EXTRACT_16(vMask, 1);

            Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
            Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);

            vGather = JOIN_16(gather0, gather1);
        }

        return vGather;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        Value* vGather;

        // use avx2 gather instruction if available
        if (JM()->mArch.AVX2())
        {
            vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = VUNDEF_I();
            Value *vScaleVec = VIMMED1((uint32_t)scale);
            Value *vOffsets = MUL(vIndices, vScaleVec);
            for (uint32_t i = 0; i < mVWidth; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets, C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase, offset);
                loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
                Value *selMask = VEXTRACT(vMask, C(i));
                // switch in a safe address to load if this lane is masked off
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress, C(0));
                vGather = VINSERT(vGather, val, C(i));
            }

            STACKRESTORE(pStack);
        }

        return vGather;
    }

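    //////////////////////////////////////////////////////////////////////////
    /// @brief 16-wide i32 gather; uses the AVX512F gather when available,
    ///        otherwise splits into two 8-wide GATHERDD operations and joins
    ///        the results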
    Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
    {
        Value *vGather = VUNDEF_I_16();

        // use AVX512F gather instruction if available
        if (JM()->mArch.AVX512F())
        {
            // force mask to <N-bit Integer>, required by vgather2
            Value *mask = BITCAST(vMask, mInt16Ty);

            vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
        }
        else
        {
            Value *src0 = EXTRACT_16(vSrc, 0);
            Value *src1 = EXTRACT_16(vSrc, 1);

            Value *indices0 = EXTRACT_16(vIndices, 0);
            Value *indices1 = EXTRACT_16(vIndices, 1);

            Value *mask0 = EXTRACT_16(vMask, 0);
            Value *mask1 = EXTRACT_16(vMask, 1);

            Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
            Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);

            vGather = JOIN_16(gather0, gather1);
        }

        return vGather;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        Value* vGather;

        // use avx2 gather instruction if available
        if (JM()->mArch.AVX2())
        {
            vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth / 2)), VectorType::get(mDoubleTy, mVWidth / 2));
            vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
        }
        else
        {
            Value* pStack = STACKSAVE();

            // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
            Value* vSrcPtr = ALLOCA(vSrc->getType());
            STORE(vSrc, vSrcPtr);

            vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
            Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
            Value *vOffsets = MUL(vIndices, vScaleVec);
            for (uint32_t i = 0; i < mVWidth / 2; ++i)
            {
                // single component byte index
                Value *offset = VEXTRACT(vOffsets, C(i));
                // byte pointer to component
                Value *loadAddress = GEP(pBase, offset);
                loadAddress = BITCAST(loadAddress, PointerType::get(mDoubleTy, 0));
                // pointer to the value to load if we're masking off a component
                Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
                Value *selMask = VEXTRACT(vMask, C(i));
                // switch in a safe address to load if this lane is masked off
                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
                Value *val = LOAD(validAddress);
                vGather = VINSERT(vGather, val, C(i));
            }
            STACKRESTORE(pStack);
        }
        return vGather;
    }

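    //////////////////////////////////////////////////////////////////////////
    /// @brief Gathers four components of the given format, dispatching to the
    ///        float path for 32-bit float formats and the integer path otherwise
    /// @param format - source surface format
    /// @param pSrcBase - byte pointer to the first element to gather
    /// @param byteOffsets - SIMD wide byte offsets from pSrcBase
    /// @param mask - SIMD wide mask of valid lanes
    /// @param vGatherComponents - output array of four gathered components
    /// @param bPackedOutput - true if components should be left packed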
    void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
        Value* mask, Value* vGatherComponents[], bool bPackedOutput)
    {
        const SWR_FORMAT_INFO &info = GetFormatInfo(format);
        if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
        {
            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
        }
        else
        {
            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
        }
    }

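    //////////////////////////////////////////////////////////////////////////
    /// @brief Gathers up to four float components per lane for 16bpc or 32bpc
    ///        formats and shuffles them into per-component SIMD registers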
    void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
        Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
    {
        switch (info.bpp / info.numComps)
        {
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((float)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
            // e.g. result of first 8x32bit gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = GEP(pSrcBase, C((char)4));

                vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
                // e.g. result of second 8x32bit gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);

                // offset base to the next component to gather
                pSrcBase = GEP(pSrcBase, C((char)4));
            }
        }
        break;
        default:
            SWR_INVALID("Invalid float format");
            break;
        }
    }

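    //////////////////////////////////////////////////////////////////////////
    /// @brief Gathers up to four integer components per lane for 8bpc, 16bpc,
    ///        or 32bpc formats and shuffles them into per-component SIMD registers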
    void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
        Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
    {
        switch (info.bpp / info.numComps)
        {
        case 8:
        {
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
            Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
            // e.g. result of an 8x32bit integer gather for 8bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

            Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = GEP(pSrcBase, C((char)4));

                vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);

        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);

                // offset base to the next component to gather
                pSrcBase = GEP(pSrcBase, C((char)4));
            }
        }
        break;
        default:
            SWR_INVALID("unsupported format");
            break;
        }
    }

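    //////////////////////////////////////////////////////////////////////////
    /// @brief Shuffles the results of two 16bpc gathers into per-component SIMD
    ///        registers, either packed into 128-bit lanes or zero extended to
    ///        one component per 32-bit lane
    /// @param info - format info for the gathered surface
    /// @param vGatherInput - the two raw gather results (xy and zw interleaved)
    /// @param vGatherOutput - output array of four shuffled components
    /// @param bPackedOutput - true to leave components packed per 128-bit lane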
    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        // input could either be float or int vector; do shuffle work in int
        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

        if (bPackedOutput)
        {
            Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                          0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
                vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
            }

            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }

        }
        else
        {
            // pshufb masks for each component
            Value* vConstMask[2];
            // x/z shuffle mask
            vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                                      0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });

            // y/w shuffle mask
            vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                      2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });


            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // select correct constMask for x/z or y/w pshufb
                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use the first gather result, else use the second (zw) gather result
                uint32_t selectedGather = (i < 2) ? 0 : 1;

                vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
                // after pshufb mask for x channel; z uses the same shuffle from the second gather
                // 256i - 0    1    2    3    4    5    6    7
                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
            }
        }
    }

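    //////////////////////////////////////////////////////////////////////////
    /// @brief Shuffles the result of an 8bpc gather into per-component SIMD
    ///        registers, either packed into 128-bit lanes or zero extended to
    ///        one component per 32-bit lane
    /// @param info - format info for the gathered surface
    /// @param vGatherInput - raw gather result with xyzw interleaved per lane
    /// @param vGatherOutput - output array of four shuffled components
    /// @param bPackedOutput - true to leave components packed per 128-bit lane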
    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        if (bPackedOutput)
        {
            Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
            // shuffle mask
            Value* vConstMask = C<char>({ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                          0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 });
            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

            Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
            }

            // shuffle the enabled components into place, applying defaults for missing components
            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        // else zero extend
        else
        {
            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // pshufb masks for each component
                Value* vConstMask;
                switch (i)
                {
                case 0:
                    // x shuffle mask
                    vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                           0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
                    break;
                case 1:
                    // y shuffle mask
                    vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                           1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
                    break;
                case 2:
                    // z shuffle mask
                    vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                           2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
                    break;
                case 3:
                    // w shuffle mask
                    vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                           3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
                    break;
                default:
                    vConstMask = nullptr;
                    break;
                }

                vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                // after pshufb for x channel
                // 256i - 0    1    2    3    4    5    6    7
                //        x000 x000 x000 x000 x000 x000 x000 x000
            }
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief emulates a scatter operation.
    /// @param pDst - pointer to destination
    /// @param vSrc - vector of src data to scatter
    /// @param vOffsets - vector of byte offsets from pDst
    /// @param vMask - mask of valid lanes
    void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
    {
        /* Scatter algorithm

           while (Index = BitScanForward(mask))
               srcElem = srcVector[Index]
               offsetElem = offsetVector[Index]
               *(pDst + offsetElem) = srcElem
               Update mask: mask &= ~(1 << Index)
        */

        BasicBlock* pCurBB = IRB()->GetInsertBlock();
        Function* pFunc = pCurBB->getParent();
        Type* pSrcTy = vSrc->getType()->getVectorElementType();

        // Store vectors on stack
        if (pScatterStackSrc == nullptr)
        {
            // Save off stack allocations and reuse per scatter. Significantly reduces stack
            // requirements for shaders with a lot of scatters.
            pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
            pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
        }

        Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
        Value* pOffsetsArrayPtr = pScatterStackOffsets;
        STORE(vSrc, pSrcArrayPtr);
        STORE(vOffsets, pOffsetsArrayPtr);

        // Cast to pointers for random access
        pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
        pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

        Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));

        // Get cttz function
        Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });

        // Setup loop basic block
        BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);

        // compute first set bit
        Value* pIndex = CALL(pfnCttz, { pMask, C(false) });

        Value* pIsUndef = ICMP_EQ(pIndex, C(32));

        // Split current block
        BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

        // Remove unconditional jump created by splitBasicBlock
        pCurBB->getTerminator()->eraseFromParent();

        // Add terminator to end of original block
        IRB()->SetInsertPoint(pCurBB);

        // Add conditional branch
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Add loop basic block contents
        IRB()->SetInsertPoint(pLoop);
        PHINode* pIndexPhi = PHI(mInt32Ty, 2);
        PHINode* pMaskPhi = PHI(mInt32Ty, 2);

        pIndexPhi->addIncoming(pIndex, pCurBB);
        pMaskPhi->addIncoming(pMask, pCurBB);

        // Extract elements for this index
        Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });

        // GEP to this offset in dst
        Value* pCurDst = GEP(pDst, pOffsetElem);
        pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
        STORE(pSrcElem, pCurDst);

        // Update the mask
        Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

        // Terminator
        Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });

        pIsUndef = ICMP_EQ(pNewIndex, C(32));
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Update phi edges
        pIndexPhi->addIncoming(pNewIndex, pLoop);
        pMaskPhi->addIncoming(pNewMask, pLoop);

        // Move builder to beginning of post loop
        IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief save/restore stack, providing ability to push/pop the stack and
    ///        reduce overall stack requirements for temporary stack use
    Value* Builder::STACKSAVE()
    {
        Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
        return CALLA(pfnStackSave);
    }

    void Builder::STACKRESTORE(Value* pSaved)
    {
        Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
        CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
    }

}