swr/rast: Enable generalized fetch jit
mesa.git: src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_mem.cpp
24 *
25 * @brief Implementation for the memory builder functions (loads, stores, gathers, scatters)
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder.h"
32 #include "common/rdtsc_buckets.h"
33
34 #include <cstdarg>
35
36
37 namespace SwrJit
38 {
39 void Builder::AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage)
40 {
41 SWR_ASSERT(ptr->getType() != mInt64Ty, "Address appears to be GFX access. Requires translation through BuilderGfxMem.");
42 }
43
44 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
45 {
46 std::vector<Value*> indices;
47 for (auto i : indexList)
48 indices.push_back(i);
49 return GEPA(ptr, indices);
50 }
51
52 Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
53 {
54 std::vector<Value*> indices;
55 for (auto i : indexList)
56 indices.push_back(C(i));
57 return GEPA(ptr, indices);
58 }
59
60 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
61 {
62 std::vector<Value*> indices;
63 for (auto i : indexList)
64 indices.push_back(i);
65 return IN_BOUNDS_GEP(ptr, indices);
66 }
67
68 Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
69 {
70 std::vector<Value*> indices;
71 for (auto i : indexList)
72 indices.push_back(C(i));
73 return IN_BOUNDS_GEP(ptr, indices);
74 }
75
76 LoadInst* Builder::LOAD(Value *Ptr, const char *Name, JIT_MEM_CLIENT usage)
77 {
78 AssertMemoryUsageParams(Ptr, usage);
79 return IRB()->CreateLoad(Ptr, Name);
80 }
81
82 LoadInst* Builder::LOAD(Value *Ptr, const Twine &Name, JIT_MEM_CLIENT usage)
83 {
84 AssertMemoryUsageParams(Ptr, usage);
85 return IRB()->CreateLoad(Ptr, Name);
86 }
87
88 LoadInst* Builder::LOAD(Type *Ty, Value *Ptr, const Twine &Name, JIT_MEM_CLIENT usage)
89 {
90 AssertMemoryUsageParams(Ptr, usage);
91 return IRB()->CreateLoad(Ty, Ptr, Name);
92 }
93
94 LoadInst* Builder::LOAD(Value *Ptr, bool isVolatile, const Twine &Name, JIT_MEM_CLIENT usage)
95 {
96 AssertMemoryUsageParams(Ptr, usage);
97 return IRB()->CreateLoad(Ptr, isVolatile, Name);
98 }
99
100 LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name, JIT_MEM_CLIENT usage)
101 {
102 AssertMemoryUsageParams(basePtr, usage);
103 std::vector<Value*> valIndices;
104 for (auto i : indices)
105 valIndices.push_back(C(i));
106 return LOAD(GEPA(basePtr, valIndices), name);
107 }
108
109 LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
110 {
111 std::vector<Value*> valIndices;
112 for (auto i : indices)
113 valIndices.push_back(i);
114 return LOAD(GEPA(basePtr, valIndices), name);
115 }
116
117 StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
118 {
119 std::vector<Value*> valIndices;
120 for (auto i : indices)
121 valIndices.push_back(C(i));
122 return STORE(val, GEPA(basePtr, valIndices));
123 }
124
125 StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
126 {
127 std::vector<Value*> valIndices;
128 for (auto i : indices)
129 valIndices.push_back(i);
130 return STORE(val, GEPA(basePtr, valIndices));
131 }
132
133 Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant *offset)
134 {
135 return GEP(base, offset);
136 }
137
138 Value* Builder::MEM_ADD(Value* i32Incr, Value* basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
139 {
140 Value* i32Value = LOAD(GEP(basePtr, indices), name);
141 Value* i32Result = ADD(i32Value, i32Incr);
142 return STORE(i32Result, GEP(basePtr, indices));
143 }
144
145 //////////////////////////////////////////////////////////////////////////
146 /// @brief Generate a masked gather operation in LLVM IR. If not
147 /// supported on the underlying platform, emulate it with loads
148 /// @param vSrc - SIMD wide value used for lanes whose mask bit is not set
149 /// @param pBase - Int8* base VB address pointer value
150 /// @param vIndices - SIMD wide value of VB byte offsets
151 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
152 /// @param scale - value to scale indices by
153 Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale, JIT_MEM_CLIENT usage)
154 {
155 AssertMemoryUsageParams(pBase, usage);
156
157 return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
158 }
159
160 Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale, JIT_MEM_CLIENT usage)
161 {
162 AssertMemoryUsageParams(pBase, usage);
163
164 return VGATHERPS_16(vSrc, pBase, vIndices, vMask, C(scale));
165 }
166
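//////////////////////////////////////////////////////////////////////////
// Illustrative sketch (comment only, not emitted IR): the per-lane result
// the masked gathers above are expected to produce at runtime. The same
// pattern applies to GATHERDD / GATHERDD_16 with 32-bit integer elements.
// SIMD_WIDTH and maskSet() are placeholder names for this sketch.
//
//     for (uint32_t lane = 0; lane < SIMD_WIDTH; ++lane)
//     {
//         dst[lane] = maskSet(vMask, lane)
//             ? *(const float*)(pBase + (int64_t)vIndices[lane] * scale)
//             : vSrc[lane];
//     }
//
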
167 //////////////////////////////////////////////////////////////////////////
168 /// @brief Generate a masked gather operation in LLVM IR. If not
169 /// supported on the underlying platform, emulate it with loads
170 /// @param vSrc - SIMD wide value used for lanes whose mask bit is not set
171 /// @param pBase - Int8* base VB address pointer value
172 /// @param vIndices - SIMD wide value of VB byte offsets
173 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
174 /// @param scale - value to scale indices by
175 Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale, JIT_MEM_CLIENT usage)
176 {
177 AssertMemoryUsageParams(pBase, usage);
178
179 return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
180 }
181
182 Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale, JIT_MEM_CLIENT usage)
183 {
184 AssertMemoryUsageParams(pBase, usage);
185
186 return VGATHERDD_16(vSrc, pBase, vIndices, vMask, C(scale));
187 }
188
189 //////////////////////////////////////////////////////////////////////////
190 /// @brief Generate a masked gather operation in LLVM IR. If not
191 /// supported on the underlying platform, emulate it with loads
192 /// @param vSrc - SIMD wide value used for lanes whose mask bit is not set
193 /// @param pBase - Int8* base VB address pointer value
194 /// @param vIndices - SIMD wide value of VB byte offsets
195 /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
196 /// @param scale - value to scale indices by
197 Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
198 {
199 Value* vGather;
200
201 // use avx2 gather instruction if available
202 if (JM()->mArch.AVX2())
203 {
204 vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth / 2)), VectorType::get(mDoubleTy, mVWidth / 2));
205 vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
206 }
207 else
208 {
209 Value* pStack = STACKSAVE();
210
211 // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
212 Value* vSrcPtr = ALLOCA(vSrc->getType());
213 STORE(vSrc, vSrcPtr);
214
215 vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
216 Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
217 Value *vOffsets = MUL(vIndices, vScaleVec);
218 for (uint32_t i = 0; i < mVWidth / 2; ++i)
219 {
220 // single component byte index
221 Value *offset = VEXTRACT(vOffsets, C(i));
222 // byte pointer to component
223 Value *loadAddress = GEP(pBase, offset);
224 loadAddress = BITCAST(loadAddress, PointerType::get(mDoubleTy, 0));
225 // pointer to the value to load if we're masking off a component
226 Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
227 Value *selMask = VEXTRACT(vMask, C(i));
228 // switch in a safe (stack) address to load from when the lane is masked off
229 Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
230 Value *val = LOAD(validAddress);
231 vGather = VINSERT(vGather, val, C(i));
232 }
233 STACKRESTORE(pStack);
234 }
235 return vGather;
236 }
237
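//////////////////////////////////////////////////////////////////////////
// Note on the non-AVX2 fallback above (sketch, not emitted IR): inactive
// lanes are redirected to the stack copy of vSrc so every lane still loads
// from a known-valid address. Roughly, with placeholder names srcCopy and
// maskSet():
//
//     for (uint32_t lane = 0; lane < SIMD_WIDTH / 2; ++lane)
//     {
//         const double* pLoad = maskSet(vMask, lane)
//             ? (const double*)(pBase + vIndices[lane] * scale)
//             : &srcCopy[lane];   // safe fallback address on the stack
//         dst[lane] = *pLoad;
//     }
//
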
238 //////////////////////////////////////////////////////////////////////////
239 /// @brief Alternative masked gather where source is a vector of pointers
240 /// @param pVecSrcPtr - SIMD wide vector of pointers
241 /// @param pVecMask - SIMD active lanes
242 /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive
243 Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
244 {
245 return MASKED_GATHER(pVecSrcPtr, 4, pVecMask, pVecPassthru);
246 }
247
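//////////////////////////////////////////////////////////////////////////
// Sketch of the per-lane behavior of GATHER_PTR above (not emitted IR);
// the literal 4 passed to MASKED_GATHER is the element alignment in bytes.
//
//     for (uint32_t lane = 0; lane < SIMD_WIDTH; ++lane)
//     {
//         dst[lane] = maskSet(pVecMask, lane) ? *pVecSrcPtr[lane]
//                                             : pVecPassthru[lane];
//     }
//
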
248 void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
249 Value* mask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage)
250 {
251 const SWR_FORMAT_INFO &info = GetFormatInfo(format);
252 if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
253 {
254 GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
255 }
256 else
257 {
258 GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
259 }
260 }
261
262 void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
263 Value* vMask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage)
264 {
265 switch (info.bpp / info.numComps)
266 {
267 case 16:
268 {
269 Value* vGatherResult[2];
270
271 // TODO: vGatherMaskedVal
272 Value* vGatherMaskedVal = VIMMED1((float)0);
273
274 // always have at least one component out of x or y to fetch
275
276 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
277 // e.g. result of first 8x32bit integer gather for 16bit components
278 // 256i - 0 1 2 3 4 5 6 7
279 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
280 //
281
282 // if we have at least one component out of z or w to fetch
283 if (info.numComps > 2)
284 {
285 // offset base to the next components(zw) in the vertex to gather
286 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
287
288 vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
289 // e.g. result of second 8x32bit integer gather for 16bit components
290 // 256i - 0 1 2 3 4 5 6 7
291 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
292 //
293 }
294 else
295 {
296 vGatherResult[1] = vGatherMaskedVal;
297 }
298
299 // Shuffle gathered components into place, each row is a component
300 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
301 }
302 break;
303 case 32:
304 {
305 // apply defaults
306 for (uint32_t i = 0; i < 4; ++i)
307 {
308 vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
309 }
310
311 for (uint32_t i = 0; i < info.numComps; i++)
312 {
313 uint32_t swizzleIndex = info.swizzle[i];
314
315 // Gather a SIMD of components
316 vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);
317
318 // offset base to the next component to gather
319 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
320 }
321 }
322 break;
323 default:
324 SWR_INVALID("Invalid float format");
325 break;
326 }
327 }
328
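//////////////////////////////////////////////////////////////////////////
// Worked example for the 16bpc path above (sketch, not emitted IR), using
// a hypothetical 4 x 16-bit element:
//
//     struct Element16 { uint16_t x, y, z, w; };   // hypothetical layout
//     // gather 0: each lane's 32-bit load packs x,y
//     uint32_t xy = *(const uint32_t*)(pSrcBase     + byteOffsets[lane]);
//     // gather 1: base advanced by 4 bytes, the 32-bit load packs z,w
//     uint32_t zw = *(const uint32_t*)(pSrcBase + 4 + byteOffsets[lane]);
//
// In the 32bpc path each component is gathered separately and the base
// likewise advances 4 bytes per component.
//
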
329 void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
330 Value* vMask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage)
331 {
332 switch (info.bpp / info.numComps)
333 {
334 case 8:
335 {
336 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
337 Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
338 // e.g. result of an 8x32bit integer gather for 8bit components
339 // 256i - 0 1 2 3 4 5 6 7
340 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
341
342 Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
343 }
344 break;
345 case 16:
346 {
347 Value* vGatherResult[2];
348
349 // TODO: vGatherMaskedVal
350 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
351
352 // always have at least one component out of x or y to fetch
353
354 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
355 // e.g. result of first 8x32bit integer gather for 16bit components
356 // 256i - 0 1 2 3 4 5 6 7
357 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
358 //
359
360 // if we have at least one component out of z or w to fetch
361 if (info.numComps > 2)
362 {
363 // offset base to the next components(zw) in the vertex to gather
364 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
365
366 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
367 // e.g. result of second 8x32bit integer gather for 16bit components
368 // 256i - 0 1 2 3 4 5 6 7
369 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
370 //
371 }
372 else
373 {
374 vGatherResult[1] = vGatherMaskedVal;
375 }
376
377 // Shuffle gathered components into place, each row is a component
378 Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
379
380 }
381 break;
382 case 32:
383 {
384 // apply defaults
385 for (uint32_t i = 0; i < 4; ++i)
386 {
387 vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
388 }
389
390 for (uint32_t i = 0; i < info.numComps; i++)
391 {
392 uint32_t swizzleIndex = info.swizzle[i];
393
394 // Gather a SIMD of components
395 vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);
396
397 // offset base to the next component to gather
398 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
399 }
400 }
401 break;
402 default:
403 SWR_INVALID("unsupported format");
404 break;
405 }
406 }
407
408 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
409 {
410 // cast types
411 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
412 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
413
414 // input could either be float or int vector; do shuffle work in int
415 vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
416 vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
417
418 if (bPackedOutput)
419 {
420 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
421
422 // shuffle mask
423 Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
424 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
425 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
426 // after pshufb: group components together in each 128bit lane
427 // 256i - 0 1 2 3 4 5 6 7
428 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
429
430 Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
431 // after PERMD: move and pack xy components into each 128bit lane
432 // 256i - 0 1 2 3 4 5 6 7
433 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
434
435 // do the same for zw components
436 Value* vi128ZW = nullptr;
437 if (info.numComps > 2)
438 {
439 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
440 vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
441 }
442
443 for (uint32_t i = 0; i < 4; i++)
444 {
445 uint32_t swizzleIndex = info.swizzle[i];
446 // todo: fix for packed
447 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
448 if (i >= info.numComps)
449 {
450 // set the default component val
451 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
452 continue;
453 }
454
455 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
456 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
457 // if x or y, use vi128XY permute result, else use vi128ZW
458 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
459
460 // extract packed component 128 bit lanes
461 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
462 }
463
464 }
465 else
466 {
467 // pshufb masks for each component
468 Value* vConstMask[2];
469 // x/z shuffle mask
470 vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
471 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
472
473 // y/w shuffle mask
474 vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
475 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
476
477
478 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
479 // apply defaults
480 for (uint32_t i = 0; i < 4; ++i)
481 {
482 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
483 }
484
485 for (uint32_t i = 0; i < info.numComps; i++)
486 {
487 uint32_t swizzleIndex = info.swizzle[i];
488
489 // select correct constMask for x/z or y/w pshufb
490 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
491 // if x or y, use the first (xy) gather result, else use the second (zw) gather result
492 uint32_t selectedGather = (i < 2) ? 0 : 1;
493
494 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
495 // after pshufb mask for x channel; z uses the same shuffle from the second gather
496 // 256i - 0 1 2 3 4 5 6 7
497 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
498 }
499 }
500 }
501
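//////////////////////////////////////////////////////////////////////////
// Reference for the PSHUFB-based shuffles above and below (sketch, not
// emitted IR): within each 128-bit lane, the bytes of the result are
//
//     for (uint32_t b = 0; b < 16; ++b)
//         out[b] = (mask[b] & 0x80) ? 0 : in[mask[b] & 0x0F];
//
// so the packed-path mask { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15 }
// regroups interleaved 16-bit x/y pairs into all x halves followed by all
// y halves, while the -1 entries in the unpacked-path masks zero-extend
// each selected component to 32 bits.
//
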
502 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
503 {
504 // cast types
505 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
506 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
507
508 if (bPackedOutput)
509 {
510 Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
511 // shuffle mask
512 Value* vConstMask = C<char>({ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
513 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 });
514 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
515 // after pshufb: group components together in each 128bit lane
516 // 256i - 0 1 2 3 4 5 6 7
517 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
518
519 Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
520 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
521 // 256i - 0 1 2 3 4 5 6 7
522 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
523
524 // do the same for zw components
525 Value* vi128ZW = nullptr;
526 if (info.numComps > 2)
527 {
528 vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
529 }
530
531 // extract the enabled components; lanes beyond info.numComps receive the format defaults
532 for (uint32_t i = 0; i < 4; i++)
533 {
534 uint32_t swizzleIndex = info.swizzle[i];
535 // todo: fix for packed
536 Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
537 if (i >= info.numComps)
538 {
539 // set the default component val
540 vGatherOutput[swizzleIndex] = vGatherMaskedVal;
541 continue;
542 }
543
544 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
545 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
546 // if x or y, use vi128XY permute result, else use vi128ZW
547 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
548
549 // extract packed component 128 bit lanes
550 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
551 }
552 }
553 // else zero extend
554 else {
555 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
556 // apply defaults
557 for (uint32_t i = 0; i < 4; ++i)
558 {
559 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
560 }
561
562 for (uint32_t i = 0; i < info.numComps; i++) {
563 uint32_t swizzleIndex = info.swizzle[i];
564
565 // pshufb masks for each component
566 Value* vConstMask;
567 switch (i)
568 {
569 case 0:
570 // x shuffle mask
571 vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
572 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
573 break;
574 case 1:
575 // y shuffle mask
576 vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
577 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
578 break;
579 case 2:
580 // z shuffle mask
581 vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
582 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
583 break;
584 case 3:
585 // w shuffle mask
586 vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
587 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
588 break;
589 default:
590 vConstMask = nullptr;
591 break;
592 }
593
594 vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
595 // after pshufb for x channel
596 // 256i - 0 1 2 3 4 5 6 7
597 // x000 x000 x000 x000 x000 x000 x000 x000
598 }
599 }
600 }
601
602 //////////////////////////////////////////////////////////////////////////
603 /// @brief emulates a scatter operation.
604 /// @param pDst - pointer to destination
605 /// @param vSrc - vector of src data to scatter
606 /// @param vOffsets - vector of byte offsets from pDst
607 /// @param vMask - mask of valid lanes
608 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
609 {
610 /* Scatter algorithm
611
612 while(Index = BitScanForward(mask))
613 srcElem = srcVector[Index]
614 offsetElem = offsetVector[Index]
615 *(pDst + offsetElem) = srcElem
616 Update mask (mask &= ~(1 << Index))
617
618 */
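
// Rough C equivalent of the loop emitted below (sketch only;
// movemask() / ctz() / ElemTy are placeholder names):
//
//     uint32_t m = movemask(vMask);
//     while (m != 0)
//     {
//         uint32_t lane = ctz(m);   // index of first active lane
//         *(ElemTy*)((uint8_t*)pDst + vOffsets[lane]) = vSrc[lane];
//         m &= ~(1u << lane);       // clear that lane and repeat
//     }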
619
620 BasicBlock* pCurBB = IRB()->GetInsertBlock();
621 Function* pFunc = pCurBB->getParent();
622 Type* pSrcTy = vSrc->getType()->getVectorElementType();
623
624 // Store vectors on stack
625 if (pScatterStackSrc == nullptr)
626 {
627 // Save off stack allocations and reuse per scatter. Significantly reduces stack
628 // requirements for shaders with a lot of scatters.
629 pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
630 pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
631 }
632
633 Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
634 Value* pOffsetsArrayPtr = pScatterStackOffsets;
635 STORE(vSrc, pSrcArrayPtr);
636 STORE(vOffsets, pOffsetsArrayPtr);
637
638 // Cast to pointers for random access
639 pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
640 pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
641
642 Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
643
644 // Setup loop basic block
645 BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);
646
647 // compute first set bit
648 Value* pIndex = CTTZ(pMask, C(false));
649
650 Value* pIsUndef = ICMP_EQ(pIndex, C(32));
651
652 // Split current block
653 BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
654
655 // Remove unconditional jump created by splitBasicBlock
656 pCurBB->getTerminator()->eraseFromParent();
657
658 // Add terminator to end of original block
659 IRB()->SetInsertPoint(pCurBB);
660
661 // Add conditional branch
662 COND_BR(pIsUndef, pPostLoop, pLoop);
663
664 // Add loop basic block contents
665 IRB()->SetInsertPoint(pLoop);
666 PHINode* pIndexPhi = PHI(mInt32Ty, 2);
667 PHINode* pMaskPhi = PHI(mInt32Ty, 2);
668
669 pIndexPhi->addIncoming(pIndex, pCurBB);
670 pMaskPhi->addIncoming(pMask, pCurBB);
671
672 // Extract elements for this index
673 Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
674 Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
675
676 // GEP to this offset in dst
677 Value* pCurDst = GEP(pDst, pOffsetElem);
678 pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
679 STORE(pSrcElem, pCurDst);
680
681 // Update the mask
682 Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
683
684 // Terminator
685 Value* pNewIndex = CTTZ(pNewMask, C(false));
686
687 pIsUndef = ICMP_EQ(pNewIndex, C(32));
688 COND_BR(pIsUndef, pPostLoop, pLoop);
689
690 // Update phi edges
691 pIndexPhi->addIncoming(pNewIndex, pLoop);
692 pMaskPhi->addIncoming(pNewMask, pLoop);
693
694 // Move builder to beginning of post loop
695 IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
696 }
697
698 }