swr/rasterizer: Better implementation of scatter
src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file builder_mem.cpp
*
* @brief Implementation for the memory builder functions (loads, stores,
*        gathers, and scatters).
*
* Notes:
*
******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"

#include <cstdarg>

namespace SwrJit
{
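    //////////////////////////////////////////////////////////////////////////
    /// @brief Sanity check shared by the memory helpers below: a plain CPU
    ///        access is expected to carry a regular pointer type. An i64
    ///        value here indicates a GFX virtual address, which must be
    ///        translated through BuilderGfxMem instead.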
    void Builder::AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage)
    {
        SWR_ASSERT(
            ptr->getType() != mInt64Ty,
            "Address appears to be GFX access. Requires translation through BuilderGfxMem.");
    }

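    //////////////////////////////////////////////////////////////////////////
    /// @brief Thin wrappers around IRBuilder::CreateGEP. The initializer_list
    ///        overloads convert their indices (constants via C()) and forward
    ///        to the ArrayRef-based GEPA helpers.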
    Value* Builder::GEP(Value* Ptr, Value* Idx, Type* Ty, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, Idx, Name);
    }

    Value* Builder::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, Idx, Name);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return GEPA(ptr, indices);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return GEPA(ptr, indices);
    }

    Value* Builder::GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, IdxList, Name);
    }

    Value* Builder::GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, IdxList, Name);
    }

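    //////////////////////////////////////////////////////////////////////////
    /// @brief inbounds variants of the GEP helpers; both overloads forward to
    ///        the ArrayRef form of IN_BOUNDS_GEP.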
    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return IN_BOUNDS_GEP(ptr, indices);
    }

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return IN_BOUNDS_GEP(ptr, indices);
    }

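    //////////////////////////////////////////////////////////////////////////
    /// @brief LOAD/LOADV helpers. The pointer is checked via
    ///        AssertMemoryUsageParams (GFX addresses must go through
    ///        BuilderGfxMem) before a plain load is emitted; the overloads
    ///        taking an index list GEP into basePtr first.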
    LoadInst* Builder::LOAD(Value* Ptr, const char* Name, Type* Ty, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Type* Ty, Value* Ptr, const Twine& Name, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ty, Ptr, Name);
    }

    LoadInst*
    Builder::LOAD(Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, isVolatile, Name);
    }

    LoadInst* Builder::LOAD(Value* basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine& name,
                            Type* Ty,
                            JIT_MEM_CLIENT usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return Builder::LOAD(GEPA(basePtr, valIndices), name);
    }

    LoadInst* Builder::LOADV(Value* basePtr,
                             const std::initializer_list<Value*>& indices,
                             const llvm::Twine& name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return LOAD(GEPA(basePtr, valIndices), name);
    }

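    //////////////////////////////////////////////////////////////////////////
    /// @brief STORE/STOREV helpers that GEP into basePtr with constant
    ///        (STORE) or Value (STOREV) indices before storing val.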
    StoreInst*
    Builder::STORE(Value* val, Value* basePtr, const std::initializer_list<uint32_t>& indices, Type* Ty, JIT_MEM_CLIENT usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return STORE(val, GEPA(basePtr, valIndices));
    }

    StoreInst*
    Builder::STOREV(Value* val, Value* basePtr, const std::initializer_list<Value*>& indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return STORE(val, GEPA(basePtr, valIndices));
    }

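    //////////////////////////////////////////////////////////////////////////
    /// @brief Advance a base pointer by 'offset' elements of its pointee
    ///        type. The gathers below pass Int8* bases, so for them this is
    ///        a byte offset to the next component.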
    Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset)
    {
        return GEP(base, offset);
    }

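    //////////////////////////////////////////////////////////////////////////
    /// @brief Read-modify-write helper: load the i32 at basePtr[indices],
    ///        add i32Incr, and store the sum back to the same location.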
    Value* Builder::MEM_ADD(Value* i32Incr,
                            Value* basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine& name)
    {
        Value* i32Value  = LOAD(GEP(basePtr, indices), name);
        Value* i32Result = ADD(i32Value, i32Incr);
        return STORE(i32Result, GEP(basePtr, indices));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERPS(Value* vSrc,
                             Value* pBase,
                             Value* vIndices,
                             Value* vMask,
                             uint8_t scale,
                             JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERDD(Value* vSrc,
                             Value* pBase,
                             Value* vIndices,
                             Value* vMask,
                             uint8_t scale,
                             JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value*
    Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Alternative masked gather where source is a vector of pointers
    /// @param pVecSrcPtr - SIMD wide vector of pointers
    /// @param pVecMask - SIMD active lanes
    /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive
    Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
    {
        return MASKED_GATHER(pVecSrcPtr, 4, pVecMask, pVecPassthru);
    }

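    //////////////////////////////////////////////////////////////////////////
    /// @brief Gather up to 4 components per lane for the given format,
    ///        dispatching to the float path (GATHER4PS) for 32-bit float
    ///        formats and to the integer path (GATHER4DD) otherwise.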
    void Builder::Gather4(const SWR_FORMAT format,
                          Value* pSrcBase,
                          Value* byteOffsets,
                          Value* mask,
                          Value* vGatherComponents[],
                          bool bPackedOutput,
                          JIT_MEM_CLIENT usage)
    {
        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
        if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
        {
            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
        else
        {
            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
    }

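    //////////////////////////////////////////////////////////////////////////
    /// @brief Float gather4: 16bpc formats are fetched with one or two dword
    ///        gathers and shuffled into per-component rows; 32bpc formats
    ///        gather each component directly, stepping the base pointer
    ///        between components.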
    void Builder::GATHER4PS(const SWR_FORMAT_INFO& info,
                            Value* pSrcBase,
                            Value* byteOffsets,
                            Value* vMask,
                            Value* vGatherComponents[],
                            bool bPackedOutput,
                            JIT_MEM_CLIENT usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((float)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERPS(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("Invalid float format");
            break;
        }
    }

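    //////////////////////////////////////////////////////////////////////////
    /// @brief Integer gather4, the counterpart of GATHER4PS: 8bpc formats use
    ///        a single dword gather, 16bpc formats one or two, and 32bpc
    ///        formats gather each component directly, stepping the base
    ///        pointer between components.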
    void Builder::GATHER4DD(const SWR_FORMAT_INFO& info,
                            Value* pSrcBase,
                            Value* byteOffsets,
                            Value* vMask,
                            Value* vGatherComponents[],
                            bool bPackedOutput,
                            JIT_MEM_CLIENT usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 8:
        {
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
            Value* vGatherResult =
                GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of an 8x32bit integer gather for 8bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

            Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERDD(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("unsupported format");
            break;
        }
    }

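    //////////////////////////////////////////////////////////////////////////
    /// @brief Rearrange the raw results of the 16bpc dword gathers so each
    ///        output row holds a single component across all SIMD lanes,
    ///        either packed into 128-bit lanes (bPackedOutput) or placed in
    ///        the low word of each 32-bit lane, zero extended.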
    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO& info,
                                      Value* vGatherInput[2],
                                      Value* vGatherOutput[4],
                                      bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        // input could either be float or int vector; do shuffle work in int
        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

        if (bPackedOutput)
        {
            Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
                                              mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                Value* vShufResult =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            }

            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        else
        {
            // pshufb masks for each component
            Value* vConstMask[2];
            // x/z shuffle mask
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });

            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});

            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // select correct constMask for x/z or y/w pshufb
                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use the first (xy) gather result, else use the second (zw) result
                uint32_t selectedGather = (i < 2) ? 0 : 1;

                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty),
                                   vConstMask[selectedMask]),
                            vGatherTy);
                // after pshufb mask for x channel; z uses the same shuffle from the second gather
                // 256i - 0    1    2    3    4    5    6    7
                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
            }
        }
    }

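    //////////////////////////////////////////////////////////////////////////
    /// @brief Rearrange the raw result of the 8bpc dword gather so each
    ///        output row holds a single component across all SIMD lanes,
    ///        either packed into 128-bit lanes (bPackedOutput) or placed in
    ///        the low byte of each 32-bit lane, zero extended.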
    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO& info,
                                     Value* vGatherInput,
                                     Value* vGatherOutput[],
                                     bool bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        if (bPackedOutput)
        {
            Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
                                           mVWidth / 4); // vwidth is units of 32 bits
            // shuffle mask
            Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
            }

            // sign extend all enabled components. If we have a full vVertexElements, output to
            // current simdvertex
            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // sign extend
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        // else zero extend
        else
        {
            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // pshufb masks for each component
                Value* vConstMask;
                switch (i)
                {
                case 0:
                    // x shuffle mask
                    vConstMask =
                        C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                    break;
                case 1:
                    // y shuffle mask
                    vConstMask =
                        C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                    break;
                case 2:
                    // z shuffle mask
                    vConstMask =
                        C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                    break;
                case 3:
                    // w shuffle mask
                    vConstMask =
                        C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                    break;
                default:
                    vConstMask = nullptr;
                    break;
                }

                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                // after pshufb for x channel
                // 256i - 0    1    2    3    4    5    6    7
                //        x000 x000 x000 x000 x000 x000 x000 x000
            }
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked scatter operation in LLVM IR.
    /// @param pDst - pointer to destination
    /// @param vSrc - vector of src data to scatter
    /// @param vOffsets - vector of byte offsets from pDst
    /// @param vMask - mask of valid lanes
    void Builder::SCATTERPS(
        Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pDst, usage);

        SWR_ASSERT(vSrc->getType()->getVectorElementType()->isFloatTy());
        VSCATTERPS(pDst, vMask, vOffsets, vSrc, C(1));
        return;

        /* Scatter algorithm

           while (Index = BitScanForward(mask))
               srcElem    = srcVector[Index]
               offsetElem = offsetVector[Index]
               *(pDst + offsetElem) = srcElem
               Update mask (mask &= ~(1 << Index))

        */

        /*

        // Reference emulation loop, kept for reference

        BasicBlock* pCurBB = IRB()->GetInsertBlock();
        Function* pFunc = pCurBB->getParent();
        Type* pSrcTy = vSrc->getType()->getVectorElementType();

        // Store vectors on stack
        if (pScatterStackSrc == nullptr)
        {
            // Save off stack allocations and reuse per scatter. Significantly reduces stack
            // requirements for shaders with a lot of scatters.
            pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
            pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
        }

        Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
        Value* pOffsetsArrayPtr = pScatterStackOffsets;
        STORE(vSrc, pSrcArrayPtr);
        STORE(vOffsets, pOffsetsArrayPtr);

        // Cast to pointers for random access
        pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
        pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

        Value* pMask = VMOVMSK(vMask);

        // Setup loop basic block
        BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);

        // compute first set bit
        Value* pIndex = CTTZ(pMask, C(false));

        Value* pIsUndef = ICMP_EQ(pIndex, C(32));

        // Split current block or create new one if building inline
        BasicBlock* pPostLoop;
        if (pCurBB->getTerminator())
        {
            pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

            // Remove unconditional jump created by splitBasicBlock
            pCurBB->getTerminator()->eraseFromParent();

            // Add terminator to end of original block
            IRB()->SetInsertPoint(pCurBB);

            // Add conditional branch
            COND_BR(pIsUndef, pPostLoop, pLoop);
        }
        else
        {
            pPostLoop = BasicBlock::Create(mpJitMgr->mContext, "PostScatter_Loop", pFunc);

            // Add conditional branch
            COND_BR(pIsUndef, pPostLoop, pLoop);
        }

        // Add loop basic block contents
        IRB()->SetInsertPoint(pLoop);
        PHINode* pIndexPhi = PHI(mInt32Ty, 2);
        PHINode* pMaskPhi = PHI(mInt32Ty, 2);

        pIndexPhi->addIncoming(pIndex, pCurBB);
        pMaskPhi->addIncoming(pMask, pCurBB);

        // Extract elements for this index
        Value* pSrcElem = LOADV(pSrcArrayPtr, {pIndexPhi});
        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, {pIndexPhi});

        // GEP to this offset in dst
        Value* pCurDst = GEP(pDst, pOffsetElem, mInt8PtrTy);
        pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
        STORE(pSrcElem, pCurDst);

        // Update the mask
        Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

        // Terminator
        Value* pNewIndex = CTTZ(pNewMask, C(false));

        pIsUndef = ICMP_EQ(pNewIndex, C(32));
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Update phi edges
        pIndexPhi->addIncoming(pNewIndex, pLoop);
        pMaskPhi->addIncoming(pNewMask, pLoop);

        // Move builder to beginning of post loop
        IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());

        */
    }
} // namespace SwrJit