swr/rast: Fix addPassesToEmitFile usage with llvm-7.0.
src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder_mem.cpp
*
* @brief Implementation for the builder memory functions (loads, stores,
*        gathers, and scatters).
*
* Notes:
*
******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"
#include "common/rdtsc_buckets.h"

#include <cstdarg>

namespace SwrJit
{
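    //////////////////////////////////////////////////////////////////////////
    /// @brief Sanity-check the pointer handed to a memory helper. A raw i64
    ///        address is treated as a GFX (GPU virtual) address, which this
    ///        CPU-side builder cannot dereference directly; such accesses are
    ///        expected to go through BuilderGfxMem instead.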
    void Builder::AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage)
    {
        SWR_ASSERT(
            ptr->getType() != mInt64Ty,
            "Address appears to be GFX access. Requires translation through BuilderGfxMem.");
    }

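    //////////////////////////////////////////////////////////////////////////
    /// @brief Thin wrappers around IRBuilder::CreateGEP. Note that the Ty
    ///        argument is ignored by the untyped overloads below; it is likely
    ///        kept in the signature so derived builders (e.g. BuilderGfxMem)
    ///        can override these entry points with typed/translated address
    ///        arithmetic.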
    Value* Builder::GEP(Value* Ptr, Value* Idx, Type* Ty, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, Idx, Name);
    }

    Value* Builder::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, Idx, Name);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return GEPA(ptr, indices);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return GEPA(ptr, indices);
    }

    Value* Builder::GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, IdxList, Name);
    }

    Value* Builder::GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, IdxList, Name);
    }

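    //////////////////////////////////////////////////////////////////////////
    /// @brief GEP variants that emit "inbounds" address arithmetic; the
    ///        initializer-list overloads expand the indices and forward to the
    ///        ArrayRef-based IN_BOUNDS_GEP.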
    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return IN_BOUNDS_GEP(ptr, indices);
    }

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return IN_BOUNDS_GEP(ptr, indices);
    }

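    //////////////////////////////////////////////////////////////////////////
    /// @brief LOAD wrappers. Each overload first validates that the address is
    ///        a legal CPU-side pointer (see AssertMemoryUsageParams) and then
    ///        delegates to IRBuilder::CreateLoad; the index-list overloads GEP
    ///        to the requested member before loading. The Ty/usage parameters
    ///        are presumably consumed by overriding builders such as
    ///        BuilderGfxMem.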
    LoadInst* Builder::LOAD(Value* Ptr, const char* Name, Type* Ty, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Type* Ty, Value* Ptr, const Twine& Name, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ty, Ptr, Name);
    }

    LoadInst*
    Builder::LOAD(Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, isVolatile, Name);
    }

    LoadInst* Builder::LOAD(Value* basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine& name,
                            Type* Ty,
                            JIT_MEM_CLIENT usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return Builder::LOAD(GEPA(basePtr, valIndices), name);
    }

    LoadInst* Builder::LOADV(Value* basePtr,
                             const std::initializer_list<Value*>& indices,
                             const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return LOAD(GEPA(basePtr, valIndices), name);
    }

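    //////////////////////////////////////////////////////////////////////////
    /// @brief STORE/STOREV helpers: compute the member address with GEPA from
    ///        the given constant (STORE) or dynamic (STOREV) indices, then
    ///        store the value to that location.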
    StoreInst*
    Builder::STORE(Value* val, Value* basePtr, const std::initializer_list<uint32_t>& indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return STORE(val, GEPA(basePtr, valIndices));
    }

    StoreInst*
    Builder::STOREV(Value* val, Value* basePtr, const std::initializer_list<Value*>& indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return STORE(val, GEPA(basePtr, valIndices));
    }

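    //////////////////////////////////////////////////////////////////////////
    /// @brief Advance a byte-addressed base pointer by a constant offset to
    ///        reach the next component to fetch; implemented as a plain GEP on
    ///        the base address.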
    Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset)
    {
        return GEP(base, offset);
    }

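    //////////////////////////////////////////////////////////////////////////
    /// @brief Read-modify-write helper: load the i32 member addressed by
    ///        basePtr/indices, add i32Incr to it, and store the sum back to
    ///        the same location.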
    Value* Builder::MEM_ADD(Value* i32Incr,
                            Value* basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine& name)
    {
        Value* i32Value  = LOAD(GEP(basePtr, indices), name);
        Value* i32Result = ADD(i32Value, i32Incr);
        return STORE(i32Result, GEP(basePtr, indices));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERPS(Value* vSrc,
                             Value* pBase,
                             Value* vIndices,
                             Value* vMask,
                             uint8_t scale,
                             JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERDD(Value* vSrc,
                             Value* pBase,
                             Value* vIndices,
                             Value* vMask,
                             uint8_t scale,
                             JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value*
    Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Alternative masked gather where source is a vector of pointers
    /// @param pVecSrcPtr - SIMD wide vector of pointers
    /// @param pVecMask - SIMD active lanes
    /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive
    Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
    {
        return MASKED_GATHER(pVecSrcPtr, 4, pVecMask, pVecPassthru);
    }

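    //////////////////////////////////////////////////////////////////////////
    /// @brief Gather the four components of an element in the given
    ///        SWR_FORMAT: 32-bit float formats go through the GATHERPS path,
    ///        all other formats through the integer GATHERDD path.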
    void Builder::Gather4(const SWR_FORMAT format,
                          Value* pSrcBase,
                          Value* byteOffsets,
                          Value* mask,
                          Value* vGatherComponents[],
                          bool bPackedOutput,
                          JIT_MEM_CLIENT usage)
    {
        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
        if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
        {
            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
        else
        {
            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
    }

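    //////////////////////////////////////////////////////////////////////////
    /// @brief Gather4 for float formats. The strategy is selected by bits per
    ///        component: 16bpc formats issue two dword gathers (xy, then zw)
    ///        followed by a shuffle pass, while 32bpc formats gather each
    ///        enabled component directly into its swizzled output slot.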
    void Builder::GATHER4PS(const SWR_FORMAT_INFO& info,
                            Value* pSrcBase,
                            Value* byteOffsets,
                            Value* vMask,
                            Value* vGatherComponents[],
                            bool bPackedOutput,
                            JIT_MEM_CLIENT usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((float)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components (zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERPS(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("Invalid float format");
            break;
        }
    }

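    //////////////////////////////////////////////////////////////////////////
    /// @brief Gather4 for integer formats. 8bpc formats need a single dword
    ///        gather plus a byte shuffle, 16bpc formats need two dword gathers
    ///        (xy, then zw) plus a word shuffle, and 32bpc formats gather each
    ///        enabled component directly into its swizzled output slot.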
    void Builder::GATHER4DD(const SWR_FORMAT_INFO& info,
                            Value* pSrcBase,
                            Value* byteOffsets,
                            Value* vMask,
                            Value* vGatherComponents[],
                            bool bPackedOutput,
                            JIT_MEM_CLIENT usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 8:
        {
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
            Value* vGatherResult =
                GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of an 8x32bit integer gather for 8bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

            Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components (zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERDD(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("unsupported format");
            break;
        }
    }

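    //////////////////////////////////////////////////////////////////////////
    /// @brief Repack two interleaved 16bpc gather results (xyxy... / zwzw...)
    ///        into per-component SIMD registers. With bPackedOutput the
    ///        components remain packed as 16-bit words within 128-bit lanes;
    ///        otherwise each component is zero-extended into its own 32-bit
    ///        lane. Missing components are filled from the format defaults.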
    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO& info,
                                      Value* vGatherInput[2],
                                      Value* vGatherOutput[4],
                                      bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        // input could either be float or int vector; do shuffle work in int
        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

        if (bPackedOutput)
        {
            Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
                                              mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                Value* vShufResult =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            }

            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        else
        {
            // pshufb masks for each component
            Value* vConstMask[2];
            // x/z shuffle mask
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });

            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});

            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // select correct constMask for x/z or y/w pshufb
                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use the first gather result, else use the second (zw) gather
                uint32_t selectedGather = (i < 2) ? 0 : 1;

                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty),
                                   vConstMask[selectedMask]),
                            vGatherTy);
                // after pshufb mask for x channel; z uses the same shuffle from the second gather
                // 256i - 0    1    2    3    4    5    6    7
                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
            }
        }
    }

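    //////////////////////////////////////////////////////////////////////////
    /// @brief Repack a single 8bpc gather result (xyzw bytes in each 32-bit
    ///        lane) into per-component SIMD registers. With bPackedOutput the
    ///        components remain packed as bytes within 128-bit lanes;
    ///        otherwise each component byte is zero-extended into its own
    ///        32-bit lane. Missing components are filled from the format
    ///        defaults.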
    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO& info,
                                     Value* vGatherInput,
                                     Value* vGatherOutput[],
                                     bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        if (bPackedOutput)
        {
            Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
                                           mVWidth / 4); // vwidth is units of 32 bits
            // shuffle mask
            Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
            }

            // sign extend all enabled components. If we have a full vVertexElements, output to
            // the current simdvertex
            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // sign extend
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        // else zero extend
        else
        {
            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // pshufb masks for each component
                Value* vConstMask;
                switch (i)
                {
                case 0:
                    // x shuffle mask
                    vConstMask =
                        C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                    break;
                case 1:
                    // y shuffle mask
                    vConstMask =
                        C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                    break;
                case 2:
                    // z shuffle mask
                    vConstMask =
                        C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                    break;
                case 3:
                    // w shuffle mask
                    vConstMask =
                        C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                    break;
                default:
                    vConstMask = nullptr;
                    break;
                }

                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                // after pshufb for x channel
                // 256i - 0    1    2    3    4    5    6    7
                //        x000 x000 x000 x000 x000 x000 x000 x000
            }
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief emulates a scatter operation.
    /// @param pDst - pointer to destination
    /// @param vSrc - vector of src data to scatter
    /// @param vOffsets - vector of byte offsets from pDst
    /// @param vMask - mask of valid lanes
    void Builder::SCATTERPS(
        Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pDst, usage);

        /* Scatter algorithm

           while(Index = BitScanForward(mask))
               srcElem = srcVector[Index]
               offsetElem = offsetVector[Index]
               *(pDst + offsetElem) = srcElem
               Update mask (mask &= ~(1 << Index))

        */

        BasicBlock* pCurBB = IRB()->GetInsertBlock();
        Function*   pFunc  = pCurBB->getParent();
        Type*       pSrcTy = vSrc->getType()->getVectorElementType();

        // Store vectors on stack
        if (pScatterStackSrc == nullptr)
        {
            // Save off stack allocations and reuse per scatter. Significantly reduces stack
            // requirements for shaders with a lot of scatters.
            pScatterStackSrc     = CreateEntryAlloca(pFunc, mSimdInt64Ty);
            pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
        }

        Value* pSrcArrayPtr     = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
        Value* pOffsetsArrayPtr = pScatterStackOffsets;
        STORE(vSrc, pSrcArrayPtr);
        STORE(vOffsets, pOffsetsArrayPtr);

        // Cast to pointers for random access
        pSrcArrayPtr     = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
        pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

        Value* pMask = VMOVMSK(vMask);

        // Setup loop basic block
        BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);

        // compute first set bit
        Value* pIndex = CTTZ(pMask, C(false));

        Value* pIsUndef = ICMP_EQ(pIndex, C(32));

        // Split current block or create new one if building inline
        BasicBlock* pPostLoop;
        if (pCurBB->getTerminator())
        {
            pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

            // Remove unconditional jump created by splitBasicBlock
            pCurBB->getTerminator()->eraseFromParent();

            // Add terminator to end of original block
            IRB()->SetInsertPoint(pCurBB);

            // Add conditional branch
            COND_BR(pIsUndef, pPostLoop, pLoop);
        }
        else
        {
            pPostLoop = BasicBlock::Create(mpJitMgr->mContext, "PostScatter_Loop", pFunc);

            // Add conditional branch
            COND_BR(pIsUndef, pPostLoop, pLoop);
        }

        // Add loop basic block contents
        IRB()->SetInsertPoint(pLoop);
        PHINode* pIndexPhi = PHI(mInt32Ty, 2);
        PHINode* pMaskPhi  = PHI(mInt32Ty, 2);

        pIndexPhi->addIncoming(pIndex, pCurBB);
        pMaskPhi->addIncoming(pMask, pCurBB);

        // Extract elements for this index
        Value* pSrcElem    = LOADV(pSrcArrayPtr, {pIndexPhi});
        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, {pIndexPhi});

        // GEP to this offset in dst
        Value* pCurDst = GEP(pDst, pOffsetElem, mInt8PtrTy);
        pCurDst        = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
        STORE(pSrcElem, pCurDst);

        // Update the mask
        Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

        // Terminator
        Value* pNewIndex = CTTZ(pNewMask, C(false));

        pIsUndef = ICMP_EQ(pNewIndex, C(32));
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Update phi edges
        pIndexPhi->addIncoming(pNewIndex, pLoop);
        pMaskPhi->addIncoming(pNewMask, pLoop);

        // Move builder to beginning of post loop
        IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
    }
} // namespace SwrJit