/****************************************************************************
 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file builder_mem.cpp
 *
 * @brief Implementation for memory builder functions
 *
 * Notes:
 *
 ******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"

#include <cstdarg>

namespace SwrJit
{
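    //////////////////////////////////////////////////////////////////////////
    /// @brief Sanity check for the memory helpers below: a 64-bit integer
    ///        address indicates a GFX (GPU virtual) access, which must be
    ///        translated through BuilderGfxMem rather than this builder.
    ///        The usage parameter is currently unused here.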
    void Builder::AssertMemoryUsageParams(Value* ptr, MEM_CLIENT usage)
    {
        SWR_ASSERT(
            ptr->getType() != mInt64Ty,
            "Address appears to be GFX access. Requires translation through BuilderGfxMem.");
    }

    Value* Builder::GEP(Value* Ptr, Value* Idx, Type* Ty, bool isReadOnly, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, Idx, Name);
    }

    Value* Builder::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, Idx, Name);
    }

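    // Convenience overloads: these accept a brace-enclosed index list, e.g.
    // GEP(pBase, {0, 2}) (indices shown for illustration only); integer
    // constants are wrapped with C() and the call is forwarded to GEPA().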
    Value* Builder::GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return GEPA(ptr, indices);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return GEPA(ptr, indices);
    }

    Value* Builder::GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, IdxList, Name);
    }

    Value* Builder::GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, IdxList, Name);
    }

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return IN_BOUNDS_GEP(ptr, indices);
    }

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return IN_BOUNDS_GEP(ptr, indices);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const char* Name, Type* Ty, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const Twine& Name, Type* Ty, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Type* Ty, Value* Ptr, const Twine& Name, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ty, Ptr, Name);
    }

    LoadInst*
    Builder::LOAD(Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, isVolatile, Name);
    }

    LoadInst* Builder::LOAD(Value* basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine& name,
                            Type* Ty,
                            MEM_CLIENT usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return Builder::LOAD(GEPA(basePtr, valIndices), name);
    }

    LoadInst* Builder::LOADV(Value* basePtr,
                             const std::initializer_list<Value*>& indices,
                             const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    StoreInst*
    Builder::STORE(Value* val, Value* basePtr, const std::initializer_list<uint32_t>& indices, Type* Ty, MEM_CLIENT usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return STORE(val, GEPA(basePtr, valIndices));
    }

    StoreInst*
    Builder::STOREV(Value* val, Value* basePtr, const std::initializer_list<Value*>& indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return STORE(val, GEPA(basePtr, valIndices));
    }

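    //////////////////////////////////////////////////////////////////////////
    /// @brief Advance a base pointer to the next component to gather. The
    ///        Gather4 paths below pass an i8* base, so the GEP amounts to a
    ///        byte offset (4 bytes per 32-bit component).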
    Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset)
    {
        return GEP(base, offset);
    }

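    //////////////////////////////////////////////////////////////////////////
    /// @brief Read-modify-write helper: loads the i32 at basePtr[indices],
    ///        adds i32Incr, and stores the result back to the same location.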
    Value* Builder::MEM_ADD(Value* i32Incr,
                            Value* basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine& name)
    {
        Value* i32Value = LOAD(GEP(basePtr, indices), name);
        Value* i32Result = ADD(i32Value, i32Incr);
        return STORE(i32Result, GEP(basePtr, indices));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
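    ///
    /// Conceptual per-lane behavior: for each lane enabled in vMask, load a
    /// 32-bit float from pBase + vIndices[lane] * scale; disabled lanes keep
    /// the corresponding element of vSrc.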
    Value* Builder::GATHERPS(Value* vSrc,
                             Value* pBase,
                             Value* vIndices,
                             Value* vMask,
                             uint8_t scale,
                             MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERDD(Value* vSrc,
                             Value* pBase,
                             Value* vIndices,
                             Value* vMask,
                             uint8_t scale,
                             MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value*
    Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Alternative masked gather where source is a vector of pointers
    /// @param pVecSrcPtr - SIMD wide vector of pointers
    /// @param pVecMask - SIMD active lanes
    /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive
    Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
    {
        return MASKED_GATHER(pVecSrcPtr, AlignType(4), pVecMask, pVecPassthru);
    }

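    //////////////////////////////////////////////////////////////////////////
    /// @brief Alternative masked scatter where destination is a vector of pointers
    /// @param pVecDstPtr - SIMD wide vector of destination pointers
    /// @param pVecSrc - SIMD wide vector of values to store
    /// @param pVecMask - SIMD active lanes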
    void Builder::SCATTER_PTR(Value* pVecDstPtr, Value* pVecSrc, Value* pVecMask)
    {
        MASKED_SCATTER(pVecSrc, pVecDstPtr, AlignType(4), pVecMask);
    }

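    //////////////////////////////////////////////////////////////////////////
    /// @brief Gather all components of a texel/vertex in the given format,
    ///        dispatching to the float path (GATHER4PS) for 32-bit float
    ///        formats and to the integer path (GATHER4DD) otherwise.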
    void Builder::Gather4(const SWR_FORMAT format,
                          Value* pSrcBase,
                          Value* byteOffsets,
                          Value* mask,
                          Value* vGatherComponents[],
                          bool bPackedOutput,
                          MEM_CLIENT usage)
    {
        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
        if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
        {
            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
        else
        {
            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
    }

    void Builder::GATHER4PS(const SWR_FORMAT_INFO& info,
                            Value* pSrcBase,
                            Value* byteOffsets,
                            Value* vMask,
                            Value* vGatherComponents[],
                            bool bPackedOutput,
                            MEM_CLIENT usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((float)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERPS(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("Invalid float format");
            break;
        }
    }

    void Builder::GATHER4DD(const SWR_FORMAT_INFO& info,
                            Value* pSrcBase,
                            Value* byteOffsets,
                            Value* vMask,
                            Value* vGatherComponents[],
                            bool bPackedOutput,
                            MEM_CLIENT usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 8:
        {
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
            Value* vGatherResult =
                GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of an 8x32bit integer gather for 8bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

            Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERDD(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("unsupported format");
            break;
        }
    }

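    //////////////////////////////////////////////////////////////////////////
    /// @brief Shuffle the results of the two 16bpc gathers into per-component
    ///        SIMD registers, one row per component. With bPackedOutput set,
    ///        the components are packed into 128-bit lanes instead.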
    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO& info,
                                      Value* vGatherInput[2],
                                      Value* vGatherOutput[4],
                                      bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        // input could either be float or int vector; do shuffle work in int
        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

        if (bPackedOutput)
        {
            Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
                                              mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                Value* vShufResult =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            }

            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        else
        {
            // pshufb masks for each component
            Value* vConstMask[2];
            // x/z shuffle mask
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });

            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});

            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // select correct constMask for x/z or y/w pshufb
                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use the first (xy) gather result, else use the second (zw) gather result
                uint32_t selectedGather = (i < 2) ? 0 : 1;

                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty),
                                   vConstMask[selectedMask]),
                            vGatherTy);
                // after pshufb mask for x channel; z uses the same shuffle from the second gather
                // 256i - 0    1    2    3    4    5    6    7
                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
            }
        }
    }

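    //////////////////////////////////////////////////////////////////////////
    /// @brief Shuffle the result of an 8bpc gather into per-component SIMD
    ///        registers. With bPackedOutput set, components are packed into
    ///        128-bit lanes; otherwise each component is zero-extended into
    ///        the low byte of its 32-bit lane.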
    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO& info,
                                     Value* vGatherInput,
                                     Value* vGatherOutput[],
                                     bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        if (bPackedOutput)
        {
            Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
                                           mVWidth / 4); // vwidth is units of 32 bits
            // shuffle mask
            Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
            }

            // select the enabled components, applying the format defaults for
            // any components that were not fetched
            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        // else zero extend
        else
        {
            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // pshufb masks for each component
                Value* vConstMask;
                switch (i)
                {
                case 0:
                    // x shuffle mask
                    vConstMask =
                        C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                    break;
                case 1:
                    // y shuffle mask
                    vConstMask =
                        C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                    break;
                case 2:
                    // z shuffle mask
                    vConstMask =
                        C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                    break;
                case 3:
                    // w shuffle mask
                    vConstMask =
                        C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                    break;
                default:
                    vConstMask = nullptr;
                    break;
                }

                assert(vConstMask && "Invalid info.numComps value");
                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                // after pshufb for x channel
                // 256i - 0    1    2    3    4    5    6    7
                //        x000 x000 x000 x000 x000 x000 x000 x000
            }
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief emulates a scatter operation.
    /// @param pDst - pointer to destination
    /// @param vSrc - vector of src data to scatter
    /// @param vOffsets - vector of byte offsets from pDst
    /// @param vMask - mask of valid lanes
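    ///
    /// Per-lane behavior: for each lane enabled in vMask, the corresponding
    /// element of vSrc is stored to pDst + vOffsets[lane] (see the reference
    /// loop kept in comments at the end of this function).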
    void Builder::SCATTERPS(
        Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pDst, usage);
#if LLVM_VERSION_MAJOR >= 11
        SWR_ASSERT(cast<VectorType>(vSrc->getType())->getElementType()->isFloatTy());
#else
        SWR_ASSERT(vSrc->getType()->getVectorElementType()->isFloatTy());
#endif
        VSCATTERPS(pDst, vMask, vOffsets, vSrc, C(1));
        return;

        /* Scatter algorithm

           while (Index = BitScanForward(mask))
               srcElem = srcVector[Index]
               offsetElem = offsetVector[Index]
               *(pDst + offsetElem) = srcElem
               Update mask (mask &= ~(1 << Index))

        */

        /*

        // Reference implementation kept around for reference

        BasicBlock* pCurBB = IRB()->GetInsertBlock();
        Function* pFunc = pCurBB->getParent();
        Type* pSrcTy = vSrc->getType()->getVectorElementType();

        // Store vectors on stack
        if (pScatterStackSrc == nullptr)
        {
            // Save off stack allocations and reuse per scatter. Significantly reduces stack
            // requirements for shaders with a lot of scatters.
            pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
            pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
        }

        Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
        Value* pOffsetsArrayPtr = pScatterStackOffsets;
        STORE(vSrc, pSrcArrayPtr);
        STORE(vOffsets, pOffsetsArrayPtr);

        // Cast to pointers for random access
        pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
        pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

        Value* pMask = VMOVMSK(vMask);

        // Setup loop basic block
        BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);

        // compute first set bit
        Value* pIndex = CTTZ(pMask, C(false));

        Value* pIsUndef = ICMP_EQ(pIndex, C(32));

        // Split current block or create new one if building inline
        BasicBlock* pPostLoop;
        if (pCurBB->getTerminator())
        {
            pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

            // Remove unconditional jump created by splitBasicBlock
            pCurBB->getTerminator()->eraseFromParent();

            // Add terminator to end of original block
            IRB()->SetInsertPoint(pCurBB);

            // Add conditional branch
            COND_BR(pIsUndef, pPostLoop, pLoop);
        }
        else
        {
            pPostLoop = BasicBlock::Create(mpJitMgr->mContext, "PostScatter_Loop", pFunc);

            // Add conditional branch
            COND_BR(pIsUndef, pPostLoop, pLoop);
        }

        // Add loop basic block contents
        IRB()->SetInsertPoint(pLoop);
        PHINode* pIndexPhi = PHI(mInt32Ty, 2);
        PHINode* pMaskPhi = PHI(mInt32Ty, 2);

        pIndexPhi->addIncoming(pIndex, pCurBB);
        pMaskPhi->addIncoming(pMask, pCurBB);

        // Extract elements for this index
        Value* pSrcElem = LOADV(pSrcArrayPtr, {pIndexPhi});
        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, {pIndexPhi});

        // GEP to this offset in dst
        Value* pCurDst = GEP(pDst, pOffsetElem, mInt8PtrTy);
        pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
        STORE(pSrcElem, pCurDst);

        // Update the mask
        Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

        // Terminator
        Value* pNewIndex = CTTZ(pNewMask, C(false));

        pIsUndef = ICMP_EQ(pNewIndex, C(32));
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Update phi edges
        pIndexPhi->addIncoming(pNewIndex, pLoop);
        pMaskPhi->addIncoming(pNewMask, pLoop);

        // Move builder to beginning of post loop
        IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());

        */
    }
} // namespace SwrJit