swr/rast: Fix 64bit float loads in x86 lowering pass
[mesa.git] / src / gallium / drivers / swr / rasterizer / jitter / builder_mem.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_mem.cpp
24 *
25 * @brief Implementation for the memory builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder.h"
32 #include "common/rdtsc_buckets.h"
33
34 #include <cstdarg>
35
36
37 namespace SwrJit
38 {
39 void Builder::AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage)
40 {
41 SWR_ASSERT(ptr->getType() != mInt64Ty, "Address appears to be GFX access. Requires translation through BuilderGfxMem.");
42 }
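    // Note on the assert above: GFX virtual addresses are carried as plain i64
    // values rather than LLVM pointers, so a 64-bit integer "pointer" reaching this
    // builder needs address translation first.  Illustrative sketch (the value
    // names are hypothetical):
    //
    //     Value* pHostPtr  = ...;  // typed pointer -> handled directly by Builder
    //     Value* xpGfxAddr = ...;  // i64 GFX address -> trips the assert here; such
    //                              // accesses go through BuilderGfxMem, which
    //                              // translates the address before the load/store.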
43
44 Value *Builder::GEP(Value *Ptr, Value *Idx, Type *Ty, const Twine &Name)
45 {
46 return IRB()->CreateGEP(Ptr, Idx, Name);
47 }
48
49 Value *Builder::GEP(Type *Ty, Value *Ptr, Value *Idx, const Twine &Name)
50 {
51 return IRB()->CreateGEP(Ty, Ptr, Idx, Name);
52 }
53
54 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList, Type *Ty)
55 {
56 std::vector<Value*> indices;
57 for (auto i : indexList)
58 indices.push_back(i);
59 return GEPA(ptr, indices);
60 }
61
62 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList, Type *Ty)
63 {
64 std::vector<Value*> indices;
65 for (auto i : indexList)
66 indices.push_back(C(i));
67 return GEPA(ptr, indices);
68 }
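    // Usage sketch for the initializer-list GEP overloads above (the pointer and the
    // member index are hypothetical, chosen only to show the constant-index form):
    //
    //     Value* pMember = GEP(pState, { 0, 2 });          // &pState-><member #2>
    //
    // which is shorthand for GEPA(pState, { C(0), C(2) }) with constant integer indices.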
69
70 Value *Builder::GEPA(Value *Ptr, ArrayRef<Value *> IdxList, const Twine &Name)
71 {
72 return IRB()->CreateGEP(Ptr, IdxList, Name);
73 }
74
75 Value *Builder::GEPA(Type *Ty, Value *Ptr, ArrayRef<Value *> IdxList, const Twine &Name)
76 {
77 return IRB()->CreateGEP(Ty, Ptr, IdxList, Name);
78 }
79
80 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
81 {
82 std::vector<Value*> indices;
83 for (auto i : indexList)
84 indices.push_back(i);
85 return IN_BOUNDS_GEP(ptr, indices);
86 }
87
88 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
89 {
90 std::vector<Value*> indices;
91 for (auto i : indexList)
92 indices.push_back(C(i));
93 return IN_BOUNDS_GEP(ptr, indices);
94 }
95
96 LoadInst* Builder::LOAD(Value *Ptr, const char *Name, Type *Ty, JIT_MEM_CLIENT usage)
97 {
98 AssertMemoryUsageParams(Ptr, usage);
99 return IRB()->CreateLoad(Ptr, Name);
100 }
101
102 LoadInst* Builder::LOAD(Value *Ptr, const Twine &Name, Type *Ty, JIT_MEM_CLIENT usage)
103 {
104 AssertMemoryUsageParams(Ptr, usage);
105 return IRB()->CreateLoad(Ptr, Name);
106 }
107
108 LoadInst* Builder::LOAD(Type *Ty, Value *Ptr, const Twine &Name, JIT_MEM_CLIENT usage)
109 {
110 AssertMemoryUsageParams(Ptr, usage);
111 return IRB()->CreateLoad(Ty, Ptr, Name);
112 }
113
114 LoadInst* Builder::LOAD(Value *Ptr, bool isVolatile, const Twine &Name, Type *Ty, JIT_MEM_CLIENT usage)
115 {
116 AssertMemoryUsageParams(Ptr, usage);
117 return IRB()->CreateLoad(Ptr, isVolatile, Name);
118 }
119
120 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name, Type *Ty, JIT_MEM_CLIENT usage)
121 {
122 std::vector<Value*> valIndices;
123 for (auto i : indices)
124 valIndices.push_back(C(i));
125 return Builder::LOAD(GEPA(basePtr, valIndices), name);
126 }
127
128 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
129 {
130 std::vector<Value*> valIndices;
131 for (auto i : indices)
132 valIndices.push_back(i);
133 return LOAD(GEPA(basePtr, valIndices), name);
134 }
135
136 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
137 {
138 std::vector<Value*> valIndices;
139 for (auto i : indices)
140 valIndices.push_back(C(i));
141 return STORE(val, GEPA(basePtr, valIndices));
142 }
143
144 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
145 {
146 std::vector<Value*> valIndices;
147 for (auto i : indices)
148 valIndices.push_back(i);
149 return STORE(val, GEPA(basePtr, valIndices));
150 }
151
152 Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant *offset)
153 {
154 return GEP(base, offset);
155 }
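    // OFFSET_TO_NEXT_COMPONENT is just a GEP on the base pointer: with the i8*
    // vertex-buffer bases used by the Gather4 paths below, the offset is in bytes,
    // so stepping by C((intptr_t)4) advances to the next 32-bit component:
    //
    //     pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4)); // base += 4 bytes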
156
157 Value* Builder::MEM_ADD(Value* i32Incr, Value* basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
158 {
159 Value* i32Value = LOAD(GEP(basePtr, indices), name);
160 Value* i32Result = ADD(i32Value, i32Incr);
161 return STORE(i32Result, GEP(basePtr, indices));
162 }
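    // MEM_ADD is a plain read-modify-write helper: load the addressed dword, add the
    // increment, store the sum back.  Usage sketch (the stats pointer and the field
    // indices are hypothetical):
    //
    //     MEM_ADD(C(1), pStats, { 0, 3 }, "IncCounter");   // pStats' field #3 += 1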
163
164 //////////////////////////////////////////////////////////////////////////
165 /// @brief Generate a masked gather operation in LLVM IR. If not
166 /// supported on the underlying platform, emulate it with loads
167         /// @param vSrc - SIMD wide value used for lanes whose mask bit is inactive
168 /// @param pBase - Int8* base VB address pointer value
169 /// @param vIndices - SIMD wide value of VB byte offsets
170 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
171 /// @param scale - value to scale indices by
172 Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale, JIT_MEM_CLIENT usage)
173 {
174 AssertMemoryUsageParams(pBase, usage);
175
176 return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
177 }
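    // Per-lane behavior of the masked gather above, written as a scalar reference
    // (assumes an 8-wide SIMD; maskActive() is a stand-in for testing a lane's mask
    // bit).  GATHERDD and GATHERPD below behave the same way for 32-bit integer and
    // 64-bit double elements respectively:
    //
    //     float result[8];
    //     for (int lane = 0; lane < 8; ++lane)
    //     {
    //         if (maskActive(vMask, lane))
    //             result[lane] = *(const float*)(pBase + vIndices[lane] * scale);
    //         else
    //             result[lane] = vSrc[lane];   // pass-through value for inactive lanes
    //     }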
178
179 //////////////////////////////////////////////////////////////////////////
180 /// @brief Generate a masked gather operation in LLVM IR. If not
181 /// supported on the underlying platform, emulate it with loads
182         /// @param vSrc - SIMD wide value used for lanes whose mask bit is inactive
183 /// @param pBase - Int8* base VB address pointer value
184 /// @param vIndices - SIMD wide value of VB byte offsets
185 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
186 /// @param scale - value to scale indices by
187 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale, JIT_MEM_CLIENT usage)
188 {
189 AssertMemoryUsageParams(pBase, usage);
190
191 return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
192 }
193
194 //////////////////////////////////////////////////////////////////////////
195 /// @brief Generate a masked gather operation in LLVM IR. If not
196 /// supported on the underlying platform, emulate it with loads
197         /// @param vSrc - SIMD wide value used for lanes whose mask bit is inactive
198 /// @param pBase - Int8* base VB address pointer value
199 /// @param vIndices - SIMD wide value of VB byte offsets
200 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
201 /// @param scale - value to scale indices by
202 Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
203 {
204 return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
205 }
206
207 //////////////////////////////////////////////////////////////////////////
208 /// @brief Alternative masked gather where source is a vector of pointers
209 /// @param pVecSrcPtr - SIMD wide vector of pointers
210 /// @param pVecMask - SIMD active lanes
211 /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive
212 Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
213 {
214 return MASKED_GATHER(pVecSrcPtr, 4, pVecMask, pVecPassthru);
215 }
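    // Unlike the base + byte-offset gathers above, GATHER_PTR carries a full pointer
    // in every lane (the literal 4 being the alignment passed through to
    // MASKED_GATHER).  Scalar sketch of the per-lane behavior (8-wide assumed,
    // maskActive() as above):
    //
    //     for (int lane = 0; lane < 8; ++lane)
    //         result[lane] = maskActive(pVecMask, lane) ? *pVecSrcPtr[lane]
    //                                                   : pVecPassthru[lane];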
216
217 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
218 Value* mask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage)
219 {
220 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
221 if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
222 {
223 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
224 }
225 else
226 {
227 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
228 }
229 }
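    // The dispatch above keys only off the first channel of the format: 32-bit float
    // data takes the GATHER4PS path, everything else (8/16/32-bit integer or
    // normalized data) is gathered as dwords and shuffled afterwards.  Caller sketch
    // (format names are examples; the other arguments follow the signature above):
    //
    //     Value* vComponents[4];
    //     Gather4(R32G32B32A32_FLOAT, pAttribBase, vByteOffsets, vMask,
    //             vComponents, false /*bPackedOutput*/, usage);   // -> GATHER4PS
    //     Gather4(R8G8B8A8_UNORM, pAttribBase, vByteOffsets, vMask,
    //             vComponents, false /*bPackedOutput*/, usage);   // -> GATHER4DD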
230
231 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
232 Value* vMask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage)
233 {
234 switch (info.bpp / info.numComps)
235 {
236 case 16:
237 {
238 Value* vGatherResult[2];
239
240 // TODO: vGatherMaskedVal
241 Value* vGatherMaskedVal = VIMMED1((float)0);
242
243 // always have at least one component out of x or y to fetch
244
245 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
246 // e.g. result of first 8x32bit integer gather for 16bit components
247 // 256i - 0 1 2 3 4 5 6 7
248 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
249 //
250
251                 // if we have at least one component out of z or w to fetch
252 if (info.numComps > 2)
253 {
254 // offset base to the next components(zw) in the vertex to gather
255 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
256
257 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
258 // e.g. result of second 8x32bit integer gather for 16bit components
259 // 256i - 0 1 2 3 4 5 6 7
260 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
261 //
262 }
263 else
264 {
265 vGatherResult[1] = vGatherMaskedVal;
266 }
267
268 // Shuffle gathered components into place, each row is a component
269 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
270 }
271 break;
272 case 32:
273 {
274 // apply defaults
275 for (uint32_t i = 0; i < 4; ++i)
276 {
277 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
278 }
279
280 for (uint32_t i = 0; i < info.numComps; i++)
281 {
282 uint32_t swizzleIndex = info.swizzle[i];
283
284 // Gather a SIMD of components
285 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);
286
287 // offset base to the next component to gather
288 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
289 }
290 }
291 break;
292 default:
293 SWR_INVALID("Invalid float format");
294 break;
295 }
296 }
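    // Scalar reference for the 32bpc loop above (8-wide assumed): each present
    // component is gathered into its swizzled output slot, while components the
    // format does not provide keep the splatted defaults set before the loop:
    //
    //     for (uint32_t c = 0; c < info.numComps; ++c)
    //         for (int lane = 0; lane < 8; ++lane)
    //             if (maskActive(vMask, lane))
    //                 out[info.swizzle[c]][lane] =
    //                     *(const float*)(pSrcBase + byteOffsets[lane] + 4 * c);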
297
298 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
299 Value* vMask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage)
300 {
301 switch (info.bpp / info.numComps)
302 {
303 case 8:
304 {
305 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
306 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
307 // e.g. result of an 8x32bit integer gather for 8bit components
308 // 256i - 0 1 2 3 4 5 6 7
309 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
310
311 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
312 }
313 break;
314 case 16:
315 {
316 Value* vGatherResult[2];
317
318 // TODO: vGatherMaskedVal
319 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
320
321 // always have at least one component out of x or y to fetch
322
323 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
324 // e.g. result of first 8x32bit integer gather for 16bit components
325 // 256i - 0 1 2 3 4 5 6 7
326 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
327 //
328
329                 // if we have at least one component out of z or w to fetch
330 if (info.numComps > 2)
331 {
332 // offset base to the next components(zw) in the vertex to gather
333 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
334
335 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
336 // e.g. result of second 8x32bit integer gather for 16bit components
337 // 256i - 0 1 2 3 4 5 6 7
338 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
339 //
340 }
341 else
342 {
343 vGatherResult[1] = vGatherMaskedVal;
344 }
345
346 // Shuffle gathered components into place, each row is a component
347 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
348
349 }
350 break;
351 case 32:
352 {
353 // apply defaults
354 for (uint32_t i = 0; i < 4; ++i)
355 {
356 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
357 }
358
359 for (uint32_t i = 0; i < info.numComps; i++)
360 {
361 uint32_t swizzleIndex = info.swizzle[i];
362
363 // Gather a SIMD of components
364 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);
365
366 // offset base to the next component to gather
367 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
368 }
369 }
370 break;
371 default:
372 SWR_INVALID("unsupported format");
373 break;
374 }
375 }
376
377 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
378 {
379 // cast types
380 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
381 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
382
383 // input could either be float or int vector; do shuffle work in int
384 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
385 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
386
387 if (bPackedOutput)
388 {
389 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
390
391 // shuffle mask
392 Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
393 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
394 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
395 // after pshufb: group components together in each 128bit lane
396 // 256i - 0 1 2 3 4 5 6 7
397 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
398
399 Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
400 // after PERMD: move and pack xy components into each 128bit lane
401 // 256i - 0 1 2 3 4 5 6 7
402 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
403
404 // do the same for zw components
405 Value* vi128ZW = nullptr;
406 if (info.numComps > 2)
407 {
408 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
409 vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
410 }
411
412 for (uint32_t i = 0; i < 4; i++)
413 {
414 uint32_t swizzleIndex = info.swizzle[i];
415                     // todo: fix for packed
416 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
417 if (i >= info.numComps)
418 {
419 // set the default component val
420 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
421 continue;
422 }
423
424 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
425 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
426 // if x or y, use vi128XY permute result, else use vi128ZW
427 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
428
429 // extract packed component 128 bit lanes
430 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
431 }
432
433 }
434 else
435 {
436 // pshufb masks for each component
437 Value* vConstMask[2];
438 // x/z shuffle mask
439 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
440 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
441
442 // y/w shuffle mask
443 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
444 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
445
446
447 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
448 // apply defaults
449 for (uint32_t i = 0; i < 4; ++i)
450 {
451 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
452 }
453
454 for (uint32_t i = 0; i < info.numComps; i++)
455 {
456 uint32_t swizzleIndex = info.swizzle[i];
457
458 // select correct constMask for x/z or y/w pshufb
459 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
460                     // if x or y, use the first gather result (xy), else use the second (zw)
461 uint32_t selectedGather = (i < 2) ? 0 : 1;
462
463 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
464 // after pshufb mask for x channel; z uses the same shuffle from the second gather
465 // 256i - 0 1 2 3 4 5 6 7
466 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
467 }
468 }
469 }
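    // Net effect of the unpacked (bPackedOutput == false) path above, per lane:
    // x and y come from the first gather result, z and w from the second, and each
    // 16-bit component ends up zero-extended in its swizzled 32-bit output slot:
    //
    //     // for each fetched component c (c < info.numComps); others keep defaults
    //     uint32_t dword = vGatherInput[(c < 2) ? 0 : 1][lane];
    //     out[info.swizzle[c]][lane] = (c & 1) ? (dword >> 16)      // y or w
    //                                          : (dword & 0xFFFF);  // x or z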
470
471 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
472 {
473 // cast types
474 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
475 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
476
477 if (bPackedOutput)
478 {
479 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
480 // shuffle mask
481 Value* vConstMask = C<char>({ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
482 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 });
483 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
484 // after pshufb: group components together in each 128bit lane
485 // 256i - 0 1 2 3 4 5 6 7
486 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
487
488 Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
489 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
490 // 256i - 0 1 2 3 4 5 6 7
491 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
492
493 // do the same for zw components
494 Value* vi128ZW = nullptr;
495 if (info.numComps > 2)
496 {
497 vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
498 }
499
500                 // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
501 for (uint32_t i = 0; i < 4; i++)
502 {
503 uint32_t swizzleIndex = info.swizzle[i];
504 // todo: fix for packed
505 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
506 if (i >= info.numComps)
507 {
508 // set the default component val
509 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
510 continue;
511 }
512
513 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
514 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
515 // if x or y, use vi128XY permute result, else use vi128ZW
516 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
517
518                     // extract packed component 128 bit lanes
519 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
520 }
521 }
522 // else zero extend
523 else {
524 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
525 // apply defaults
526 for (uint32_t i = 0; i < 4; ++i)
527 {
528 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
529 }
530
531 for (uint32_t i = 0; i < info.numComps; i++) {
532 uint32_t swizzleIndex = info.swizzle[i];
533
534 // pshufb masks for each component
535 Value* vConstMask;
536 switch (i)
537 {
538 case 0:
539 // x shuffle mask
540 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
541 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
542 break;
543 case 1:
544 // y shuffle mask
545 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
546 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
547 break;
548 case 2:
549 // z shuffle mask
550 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
551 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
552 break;
553 case 3:
554 // w shuffle mask
555 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
556 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
557 break;
558 default:
559 vConstMask = nullptr;
560 break;
561 }
562
563 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
564 // after pshufb for x channel
565 // 256i - 0 1 2 3 4 5 6 7
566 // x000 x000 x000 x000 x000 x000 x000 x000
567 }
568 }
569 }
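    // Net effect of the zero-extend (bPackedOutput == false) path above, per lane:
    // component c is byte c of the gathered dword, zero-extended to 32 bits and
    // written to its swizzled output slot:
    //
    //     // for each fetched component c (c < info.numComps); others keep defaults
    //     out[info.swizzle[c]][lane] = (vGatherInput[lane] >> (8 * c)) & 0xFF;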
570
571 //////////////////////////////////////////////////////////////////////////
572 /// @brief emulates a scatter operation.
573 /// @param pDst - pointer to destination
574 /// @param vSrc - vector of src data to scatter
575 /// @param vOffsets - vector of byte offsets from pDst
576 /// @param vMask - mask of valid lanes
577 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
578 {
579 /* Scatter algorithm
580
581 while(Index = BitScanForward(mask))
582 srcElem = srcVector[Index]
583 offsetElem = offsetVector[Index]
584 *(pDst + offsetElem) = srcElem
585             Update mask: mask &= ~(1 << Index)
586
587 */
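        // The same algorithm as a compilable scalar sketch (8-wide assumed;
        // movemask()/tzcnt() stand in for VMOVMSKPS/CTTZ, with tzcnt returning 32
        // for a zero mask, and ElemTy standing in for the scalar element type of vSrc):
        //
        //     uint32_t mask = movemask(vMask);
        //     for (uint32_t index = tzcnt(mask); index != 32; index = tzcnt(mask))
        //     {
        //         *(ElemTy*)(pDst + vOffsets[index]) = vSrc[index];
        //         mask &= ~(1u << index);
        //     }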
588
589 BasicBlock* pCurBB = IRB()->GetInsertBlock();
590 Function* pFunc = pCurBB->getParent();
591 Type* pSrcTy = vSrc->getType()->getVectorElementType();
592
593 // Store vectors on stack
594 if (pScatterStackSrc == nullptr)
595 {
596 // Save off stack allocations and reuse per scatter. Significantly reduces stack
597 // requirements for shaders with a lot of scatters.
598 pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
599 pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
600 }
601
602 Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
603 Value* pOffsetsArrayPtr = pScatterStackOffsets;
604 STORE(vSrc, pSrcArrayPtr);
605 STORE(vOffsets, pOffsetsArrayPtr);
606
607 // Cast to pointers for random access
608 pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
609 pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
610
611 Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
612
613 // Setup loop basic block
614 BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);
615
616 // compute first set bit
617 Value* pIndex = CTTZ(pMask, C(false));
618
619 Value* pIsUndef = ICMP_EQ(pIndex, C(32));
620
621 // Split current block
622 BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
623
624 // Remove unconditional jump created by splitBasicBlock
625 pCurBB->getTerminator()->eraseFromParent();
626
627 // Add terminator to end of original block
628 IRB()->SetInsertPoint(pCurBB);
629
630 // Add conditional branch
631 COND_BR(pIsUndef, pPostLoop, pLoop);
632
633 // Add loop basic block contents
634 IRB()->SetInsertPoint(pLoop);
635 PHINode* pIndexPhi = PHI(mInt32Ty, 2);
636 PHINode* pMaskPhi = PHI(mInt32Ty, 2);
637
638 pIndexPhi->addIncoming(pIndex, pCurBB);
639 pMaskPhi->addIncoming(pMask, pCurBB);
640
641 // Extract elements for this index
642 Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
643 Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
644
645 // GEP to this offset in dst
646 Value* pCurDst = GEP(pDst, pOffsetElem);
647 pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
648 STORE(pSrcElem, pCurDst);
649
650 // Update the mask
651 Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
652
653 // Terminator
654 Value* pNewIndex = CTTZ(pNewMask, C(false));
655
656 pIsUndef = ICMP_EQ(pNewIndex, C(32));
657 COND_BR(pIsUndef, pPostLoop, pLoop);
658
659 // Update phi edges
660 pIndexPhi->addIncoming(pNewIndex, pLoop);
661 pMaskPhi->addIncoming(pNewMask, pLoop);
662
663 // Move builder to beginning of post loop
664 IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
665 }
666 }