From: George Kyriazis
Date: Tue, 23 Jan 2018 01:23:32 +0000 (-0600)
Subject: swr/rast: Move memory-related JIT functions
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=36dbbf11a0afda57940b7ca224f318dd3915d8c5;p=mesa.git

swr/rast: Move memory-related JIT functions

Move them to their own file (builder_mem.{h|cpp}).
Add builder_mem.cpp to the build system.

Reviewed-by: Bruce Cherniak
---

diff --git a/src/gallium/drivers/swr/Makefile.sources b/src/gallium/drivers/swr/Makefile.sources
index cd2040e1371..cbf73953911 100644
--- a/src/gallium/drivers/swr/Makefile.sources
+++ b/src/gallium/drivers/swr/Makefile.sources
@@ -140,6 +140,8 @@ JITTER_CXX_SOURCES := \
 	rasterizer/jitter/builder.cpp \
 	rasterizer/jitter/builder.h \
 	rasterizer/jitter/builder_math.h \
+	rasterizer/jitter/builder_mem.cpp \
+	rasterizer/jitter/builder_mem.h \
 	rasterizer/jitter/builder_misc.cpp \
 	rasterizer/jitter/builder_misc.h \
 	rasterizer/jitter/fetch_jit.cpp \
diff --git a/src/gallium/drivers/swr/meson.build b/src/gallium/drivers/swr/meson.build
index 8dffb4c000d..ae86c8ec380 100644
--- a/src/gallium/drivers/swr/meson.build
+++ b/src/gallium/drivers/swr/meson.build
@@ -68,6 +68,8 @@ files_swr_mesa = files(
   'rasterizer/jitter/builder.cpp',
   'rasterizer/jitter/builder.h',
   'rasterizer/jitter/builder_math.h',
+  'rasterizer/jitter/builder_mem.cpp',
+  'rasterizer/jitter/builder_mem.h',
   'rasterizer/jitter/builder_misc.cpp',
   'rasterizer/jitter/builder_misc.h',
   'rasterizer/jitter/fetch_jit.cpp',
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 2e714f97380..5d1a6b9273c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -94,5 +94,6 @@ namespace SwrJit
 #include "gen_builder_x86.hpp"
 #include "builder_misc.h"
 #include "builder_math.h"
+#include "builder_mem.h"
 };
 }
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
new file mode 100644
index 00000000000..c8ba5b465e2
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -0,0 +1,816 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+* +* @file builder_misc.cpp +* +* @brief Implementation for miscellaneous builder functions +* +* Notes: +* +******************************************************************************/ +#include "jit_pch.hpp" +#include "builder.h" +#include "common/rdtsc_buckets.h" + +#include + + +namespace SwrJit +{ + + Value *Builder::GEP(Value* ptr, const std::initializer_list &indexList) + { + std::vector indices; + for (auto i : indexList) + indices.push_back(i); + return GEPA(ptr, indices); + } + + Value *Builder::GEP(Value* ptr, const std::initializer_list &indexList) + { + std::vector indices; + for (auto i : indexList) + indices.push_back(C(i)); + return GEPA(ptr, indices); + } + + Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list &indexList) + { + std::vector indices; + for (auto i : indexList) + indices.push_back(i); + return IN_BOUNDS_GEP(ptr, indices); + } + + Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list &indexList) + { + std::vector indices; + for (auto i : indexList) + indices.push_back(C(i)); + return IN_BOUNDS_GEP(ptr, indices); + } + + LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list &indices, const llvm::Twine& name) + { + std::vector valIndices; + for (auto i : indices) + valIndices.push_back(C(i)); + return LOAD(GEPA(basePtr, valIndices), name); + } + + LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list &indices, const llvm::Twine& name) + { + std::vector valIndices; + for (auto i : indices) + valIndices.push_back(i); + return LOAD(GEPA(basePtr, valIndices), name); + } + + StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list &indices) + { + std::vector valIndices; + for (auto i : indices) + valIndices.push_back(C(i)); + return STORE(val, GEPA(basePtr, valIndices)); + } + + StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list &indices) + { + std::vector valIndices; + for (auto i : indices) + valIndices.push_back(i); + return STORE(val, GEPA(basePtr, valIndices)); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Generate an i32 masked load operation in LLVM IR. If not + /// supported on the underlying platform, emulate it with float masked load + /// @param src - base address pointer for the load + /// @param vMask - SIMD wide mask that controls whether to access memory load 0 + Value *Builder::MASKLOADD(Value* src, Value* mask) + { + Value* vResult; + // use avx2 gather instruction is available + if (JM()->mArch.AVX2()) + { + Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256); + vResult = CALL(func, { src,mask }); + } + else + { + // maskload intrinsic expects integer mask operand in llvm >= 3.8 +#if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8) + mask = BITCAST(mask, VectorType::get(mInt32Ty, mVWidth)); +#else + mask = BITCAST(mask, VectorType::get(mFP32Ty, mVWidth)); +#endif + Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskload_ps_256); + vResult = BITCAST(CALL(func, { src,mask }), VectorType::get(mInt32Ty, mVWidth)); + } + return vResult; + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Generate a masked gather operation in LLVM IR. 
If not + /// supported on the underlying platform, emulate it with loads + /// @param vSrc - SIMD wide value that will be loaded if mask is invalid + /// @param pBase - Int8* base VB address pointer value + /// @param vIndices - SIMD wide value of VB byte offsets + /// @param vMask - SIMD wide mask that controls whether to access memory or the src values + /// @param scale - value to scale indices by + Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale, Value *pDrawContext) + { + Value *vGather; + + // use avx2 gather instruction if available + if (JM()->mArch.AVX2()) + { + // force mask to , required by vgather + Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty); + + vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale)); + } + else + { + Value* pStack = STACKSAVE(); + + // store vSrc on the stack. this way we can select between a valid load address and the vSrc address + Value* vSrcPtr = ALLOCA(vSrc->getType()); + STORE(vSrc, vSrcPtr); + + vGather = VUNDEF_F(); + Value *vScaleVec = VIMMED1((uint32_t)scale); + Value *vOffsets = MUL(vIndices, vScaleVec); + for (uint32_t i = 0; i < mVWidth; ++i) + { + // single component byte index + Value *offset = VEXTRACT(vOffsets, C(i)); + // byte pointer to component + Value *loadAddress = GEP(pBase, offset); + loadAddress = BITCAST(loadAddress, PointerType::get(mFP32Ty, 0)); + // pointer to the value to load if we're masking off a component + Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) }); + Value *selMask = VEXTRACT(vMask, C(i)); + // switch in a safe address to load if we're trying to access a vertex + Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); + Value *val = LOAD(validAddress); + vGather = VINSERT(vGather, val, C(i)); + } + + STACKRESTORE(pStack); + } + + return vGather; + } + + Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale) + { + Value *vGather = VUNDEF_F_16(); + + // use AVX512F gather instruction if available + if (JM()->mArch.AVX512F()) + { + // force mask to , required by vgather2 + Value *mask = BITCAST(vMask, mInt16Ty); + + vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale)); + } + else + { + Value *src0 = EXTRACT_16(vSrc, 0); + Value *src1 = EXTRACT_16(vSrc, 1); + + Value *indices0 = EXTRACT_16(vIndices, 0); + Value *indices1 = EXTRACT_16(vIndices, 1); + + Value *mask0 = EXTRACT_16(vMask, 0); + Value *mask1 = EXTRACT_16(vMask, 1); + + Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale); + Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale); + + vGather = JOIN_16(gather0, gather1); + } + + return vGather; + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Generate a masked gather operation in LLVM IR. 
If not + /// supported on the underlying platform, emulate it with loads + /// @param vSrc - SIMD wide value that will be loaded if mask is invalid + /// @param pBase - Int8* base VB address pointer value + /// @param vIndices - SIMD wide value of VB byte offsets + /// @param vMask - SIMD wide mask that controls whether to access memory or the src values + /// @param scale - value to scale indices by + Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale) + { + Value* vGather; + + // use avx2 gather instruction if available + if (JM()->mArch.AVX2()) + { + vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale)); + } + else + { + Value* pStack = STACKSAVE(); + + // store vSrc on the stack. this way we can select between a valid load address and the vSrc address + Value* vSrcPtr = ALLOCA(vSrc->getType()); + STORE(vSrc, vSrcPtr); + + vGather = VUNDEF_I(); + Value *vScaleVec = VIMMED1((uint32_t)scale); + Value *vOffsets = MUL(vIndices, vScaleVec); + for (uint32_t i = 0; i < mVWidth; ++i) + { + // single component byte index + Value *offset = VEXTRACT(vOffsets, C(i)); + // byte pointer to component + Value *loadAddress = GEP(pBase, offset); + loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0)); + // pointer to the value to load if we're masking off a component + Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) }); + Value *selMask = VEXTRACT(vMask, C(i)); + // switch in a safe address to load if we're trying to access a vertex + Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); + Value *val = LOAD(validAddress, C(0)); + vGather = VINSERT(vGather, val, C(i)); + } + + STACKRESTORE(pStack); + } + + return vGather; + } + + Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale) + { + Value *vGather = VUNDEF_I_16(); + + // use AVX512F gather instruction if available + if (JM()->mArch.AVX512F()) + { + // force mask to , required by vgather2 + Value *mask = BITCAST(vMask, mInt16Ty); + + vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale)); + } + else + { + Value *src0 = EXTRACT_16(vSrc, 0); + Value *src1 = EXTRACT_16(vSrc, 1); + + Value *indices0 = EXTRACT_16(vIndices, 0); + Value *indices1 = EXTRACT_16(vIndices, 1); + + Value *mask0 = EXTRACT_16(vMask, 0); + Value *mask1 = EXTRACT_16(vMask, 1); + + Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale); + Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale); + + vGather = JOIN_16(gather0, gather1); + } + + return vGather; + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Generate a masked gather operation in LLVM IR. 
If not + /// supported on the underlying platform, emulate it with loads + /// @param vSrc - SIMD wide value that will be loaded if mask is invalid + /// @param pBase - Int8* base VB address pointer value + /// @param vIndices - SIMD wide value of VB byte offsets + /// @param vMask - SIMD wide mask that controls whether to access memory or the src values + /// @param scale - value to scale indices by + Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale) + { + Value* vGather; + + // use avx2 gather instruction if available + if (JM()->mArch.AVX2()) + { + vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth / 2)), VectorType::get(mDoubleTy, mVWidth / 2)); + vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale)); + } + else + { + Value* pStack = STACKSAVE(); + + // store vSrc on the stack. this way we can select between a valid load address and the vSrc address + Value* vSrcPtr = ALLOCA(vSrc->getType()); + STORE(vSrc, vSrcPtr); + + vGather = UndefValue::get(VectorType::get(mDoubleTy, 4)); + Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale)); + Value *vOffsets = MUL(vIndices, vScaleVec); + for (uint32_t i = 0; i < mVWidth / 2; ++i) + { + // single component byte index + Value *offset = VEXTRACT(vOffsets, C(i)); + // byte pointer to component + Value *loadAddress = GEP(pBase, offset); + loadAddress = BITCAST(loadAddress, PointerType::get(mDoubleTy, 0)); + // pointer to the value to load if we're masking off a component + Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) }); + Value *selMask = VEXTRACT(vMask, C(i)); + // switch in a safe address to load if we're trying to access a vertex + Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); + Value *val = LOAD(validAddress); + vGather = VINSERT(vGather, val, C(i)); + } + STACKRESTORE(pStack); + } + return vGather; + } + + void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, + Value* mask, Value* vGatherComponents[], bool bPackedOutput) + { + const SWR_FORMAT_INFO &info = GetFormatInfo(format); + if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32) + { + GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput); + } + else + { + GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput); + } + } + + void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, + Value* vMask, Value* vGatherComponents[], bool bPackedOutput) + { + switch (info.bpp / info.numComps) + { + case 16: + { + Value* vGatherResult[2]; + + // TODO: vGatherMaskedVal + Value* vGatherMaskedVal = VIMMED1((float)0); + + // always have at least one component out of x or y to fetch + + vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); + // e.g. result of first 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy + // + + // if we have at least one component out of x or y to fetch + if (info.numComps > 2) + { + // offset base to the next components(zw) in the vertex to gather + pSrcBase = GEP(pSrcBase, C((char)4)); + + vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); + // e.g. 
result of second 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw + // + } + else + { + vGatherResult[1] = vGatherMaskedVal; + } + + // Shuffle gathered components into place, each row is a component + Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); + } + break; + case 32: + { + // apply defaults + for (uint32_t i = 0; i < 4; ++i) + { + vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]); + } + + for (uint32_t i = 0; i < info.numComps; i++) + { + uint32_t swizzleIndex = info.swizzle[i]; + + // Gather a SIMD of components + vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask); + + // offset base to the next component to gather + pSrcBase = GEP(pSrcBase, C((char)4)); + } + } + break; + default: + SWR_INVALID("Invalid float format"); + break; + } + } + + void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, + Value* vMask, Value* vGatherComponents[], bool bPackedOutput) + { + switch (info.bpp / info.numComps) + { + case 8: + { + Value* vGatherMaskedVal = VIMMED1((int32_t)0); + Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); + // e.g. result of an 8x32bit integer gather for 8bit components + // 256i - 0 1 2 3 4 5 6 7 + // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw + + Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); + } + break; + case 16: + { + Value* vGatherResult[2]; + + // TODO: vGatherMaskedVal + Value* vGatherMaskedVal = VIMMED1((int32_t)0); + + // always have at least one component out of x or y to fetch + + vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); + // e.g. result of first 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy + // + + // if we have at least one component out of x or y to fetch + if (info.numComps > 2) + { + // offset base to the next components(zw) in the vertex to gather + pSrcBase = GEP(pSrcBase, C((char)4)); + + vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); + // e.g. 
result of second 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw + // + } + else + { + vGatherResult[1] = vGatherMaskedVal; + } + + // Shuffle gathered components into place, each row is a component + Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); + + } + break; + case 32: + { + // apply defaults + for (uint32_t i = 0; i < 4; ++i) + { + vGatherComponents[i] = VIMMED1((int)info.defaults[i]); + } + + for (uint32_t i = 0; i < info.numComps; i++) + { + uint32_t swizzleIndex = info.swizzle[i]; + + // Gather a SIMD of components + vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask); + + // offset base to the next component to gather + pSrcBase = GEP(pSrcBase, C((char)4)); + } + } + break; + default: + SWR_INVALID("unsupported format"); + break; + } + } + + void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput) + { + // cast types + Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits + + // input could either be float or int vector; do shuffle work in int + vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty); + vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty); + + if (bPackedOutput) + { + Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits + + // shuffle mask + Value* vConstMask = C({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }); + Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy); + // after pshufb: group components together in each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy + + Value* vi128XY = BITCAST(PERMD(vShufResult, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); + // after PERMD: move and pack xy components into each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy + + // do the same for zw components + Value* vi128ZW = nullptr; + if (info.numComps > 2) + { + Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy); + vi128ZW = BITCAST(PERMD(vShufResult, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); + } + + for (uint32_t i = 0; i < 4; i++) + { + uint32_t swizzleIndex = info.swizzle[i]; + // todo: fixed for packed + Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); + if (i >= info.numComps) + { + // set the default component val + vGatherOutput[swizzleIndex] = vGatherMaskedVal; + continue; + } + + // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 + uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; + // if x or y, use vi128XY permute result, else use vi128ZW + Value* selectedPermute = (i < 2) ? 
vi128XY : vi128ZW; + + // extract packed component 128 bit lanes + vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); + } + + } + else + { + // pshufb masks for each component + Value* vConstMask[2]; + // x/z shuffle mask + vConstMask[0] = C({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, + 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); + + // y/w shuffle mask + vConstMask[1] = C({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, + 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 }); + + + // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits + // apply defaults + for (uint32_t i = 0; i < 4; ++i) + { + vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); + } + + for (uint32_t i = 0; i < info.numComps; i++) + { + uint32_t swizzleIndex = info.swizzle[i]; + + // select correct constMask for x/z or y/w pshufb + uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; + // if x or y, use vi128XY permute result, else use vi128ZW + uint32_t selectedGather = (i < 2) ? 0 : 1; + + vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); + // after pshufb mask for x channel; z uses the same shuffle from the second gather + // 256i - 0 1 2 3 4 5 6 7 + // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 + } + } + } + + void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput) + { + // cast types + Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits + + if (bPackedOutput) + { + Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits + // shuffle mask + Value* vConstMask = C({ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }); + Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); + // after pshufb: group components together in each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww + + Value* vi128XY = BITCAST(PERMD(vShufResult, C({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty); + // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care) + + // do the same for zw components + Value* vi128ZW = nullptr; + if (info.numComps > 2) + { + vi128ZW = BITCAST(PERMD(vShufResult, C({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty); + } + + // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex + for (uint32_t i = 0; i < 4; i++) + { + uint32_t swizzleIndex = info.swizzle[i]; + // todo: fix for packed + Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); + if (i >= info.numComps) + { + // set the default component val + vGatherOutput[swizzleIndex] = vGatherMaskedVal; + continue; + } + + // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 + uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; + // if x or y, use vi128XY permute result, else use vi128ZW + Value* selectedPermute = (i < 2) ? 
vi128XY : vi128ZW; + + // sign extend + vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); + } + } + // else zero extend + else { + // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits + // apply defaults + for (uint32_t i = 0; i < 4; ++i) + { + vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); + } + + for (uint32_t i = 0; i < info.numComps; i++) { + uint32_t swizzleIndex = info.swizzle[i]; + + // pshufb masks for each component + Value* vConstMask; + switch (i) + { + case 0: + // x shuffle mask + vConstMask = C({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, + 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 }); + break; + case 1: + // y shuffle mask + vConstMask = C({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, + 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 }); + break; + case 2: + // z shuffle mask + vConstMask = C({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, + 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 }); + break; + case 3: + // w shuffle mask + vConstMask = C({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, + 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 }); + break; + default: + vConstMask = nullptr; + break; + } + + vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); + // after pshufb for x channel + // 256i - 0 1 2 3 4 5 6 7 + // x000 x000 x000 x000 x000 x000 x000 x000 + } + } + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief emulates a scatter operation. + /// @param pDst - pointer to destination + /// @param vSrc - vector of src data to scatter + /// @param vOffsets - vector of byte offsets from pDst + /// @param vMask - mask of valid lanes + void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask) + { + /* Scatter algorithm + + while(Index = BitScanForward(mask)) + srcElem = srcVector[Index] + offsetElem = offsetVector[Index] + *(pDst + offsetElem) = srcElem + Update mask (&= ~(1<GetInsertBlock(); + Function* pFunc = pCurBB->getParent(); + Type* pSrcTy = vSrc->getType()->getVectorElementType(); + + // Store vectors on stack + if (pScatterStackSrc == nullptr) + { + // Save off stack allocations and reuse per scatter. Significantly reduces stack + // requirements for shaders with a lot of scatters. 
+ pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty); + pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty); + } + + Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0)); + Value* pOffsetsArrayPtr = pScatterStackOffsets; + STORE(vSrc, pSrcArrayPtr); + STORE(vOffsets, pOffsetsArrayPtr); + + // Cast to pointers for random access + pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0)); + pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0)); + + Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty)); + + // Get cttz function + Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty }); + + // Setup loop basic block + BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc); + + // compute first set bit + Value* pIndex = CALL(pfnCttz, { pMask, C(false) }); + + Value* pIsUndef = ICMP_EQ(pIndex, C(32)); + + // Split current block + BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast(pIsUndef)->getNextNode()); + + // Remove unconditional jump created by splitBasicBlock + pCurBB->getTerminator()->eraseFromParent(); + + // Add terminator to end of original block + IRB()->SetInsertPoint(pCurBB); + + // Add conditional branch + COND_BR(pIsUndef, pPostLoop, pLoop); + + // Add loop basic block contents + IRB()->SetInsertPoint(pLoop); + PHINode* pIndexPhi = PHI(mInt32Ty, 2); + PHINode* pMaskPhi = PHI(mInt32Ty, 2); + + pIndexPhi->addIncoming(pIndex, pCurBB); + pMaskPhi->addIncoming(pMask, pCurBB); + + // Extract elements for this index + Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi }); + Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi }); + + // GEP to this offset in dst + Value* pCurDst = GEP(pDst, pOffsetElem); + pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0)); + STORE(pSrcElem, pCurDst); + + // Update the mask + Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi))); + + // Terminator + Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) }); + + pIsUndef = ICMP_EQ(pNewIndex, C(32)); + COND_BR(pIsUndef, pPostLoop, pLoop); + + // Update phi edges + pIndexPhi->addIncoming(pNewIndex, pLoop); + pMaskPhi->addIncoming(pNewMask, pLoop); + + // Move builder to beginning of post loop + IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin()); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief save/restore stack, providing ability to push/pop the stack and + /// reduce overall stack requirements for temporary stack use + Value* Builder::STACKSAVE() + { + Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave); + return CALLA(pfnStackSave); + } + + void Builder::STACKRESTORE(Value* pSaved) + { + Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore); + CALL(pfnStackRestore, std::initializer_list{pSaved}); + } + +} diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h new file mode 100644 index 00000000000..6264f398161 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h @@ -0,0 +1,73 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file builder_misc.h +* +* @brief miscellaneous builder functions +* +* Notes: +* +******************************************************************************/ +#pragma once + +Value *GEP(Value* ptr, const std::initializer_list &indexList); +Value *GEP(Value* ptr, const std::initializer_list &indexList); +Value *IN_BOUNDS_GEP(Value* ptr, const std::initializer_list &indexList); +Value *IN_BOUNDS_GEP(Value* ptr, const std::initializer_list &indexList); + +LoadInst *LOAD(Value *BasePtr, const std::initializer_list &offset, const llvm::Twine& name = ""); +LoadInst *LOADV(Value *BasePtr, const std::initializer_list &offset, const llvm::Twine& name = ""); +StoreInst *STORE(Value *Val, Value *BasePtr, const std::initializer_list &offset); +StoreInst *STOREV(Value *Val, Value *BasePtr, const std::initializer_list &offset); + +Value *MASKLOADD(Value* src, Value* mask); + +void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, + Value* mask, Value* vGatherComponents[], bool bPackedOutput); + +virtual Value *GATHERPS(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1, Value *pDrawContext = nullptr); +Value *GATHERPS_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1); + +void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, + Value* mask, Value* vGatherComponents[], bool bPackedOutput); + +Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1); +Value *GATHERDD_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1); + +void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, + Value* mask, Value* vGatherComponents[], bool bPackedOutput); + +Value *GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1); + +void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask); + +void Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput); +void Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[], Value* vGatherOutput[], bool bPackedOutput); + +Value* STACKSAVE(); +void STACKRESTORE(Value* pSaved); + +// Static stack allocations for scatter operations +Value* pScatterStackSrc{ nullptr }; +Value* pScatterStackOffsets{ nullptr }; + + diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index 10a5979d864..0738d023321 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -303,70 +303,6 @@ namespace SwrJit return pValConst->getSExtValue(); } - Value *Builder::GEP(Value* ptr, const std::initializer_list &indexList) - { - std::vector indices; - for (auto i : indexList) - indices.push_back(i); - return GEPA(ptr, indices); - } - - Value *Builder::GEP(Value* ptr, const std::initializer_list &indexList) - { - std::vector indices; - for (auto i : indexList) - indices.push_back(C(i)); - return GEPA(ptr, indices); - } - - Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list &indexList) - { - std::vector indices; - for (auto i : indexList) - indices.push_back(i); - return IN_BOUNDS_GEP(ptr, indices); - } - - Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list &indexList) - { - std::vector indices; - for (auto i : indexList) - indices.push_back(C(i)); - return IN_BOUNDS_GEP(ptr, indices); - } - - LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list &indices, const llvm::Twine& name) - { - std::vector valIndices; - for (auto i : indices) - valIndices.push_back(C(i)); - return LOAD(GEPA(basePtr, valIndices), name); - } - - LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list &indices, const llvm::Twine& name) - { - std::vector valIndices; - for (auto i : indices) - valIndices.push_back(i); - return LOAD(GEPA(basePtr, valIndices), name); - } - - StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list &indices) - { - std::vector valIndices; - for (auto i : indices) - valIndices.push_back(C(i)); - return STORE(val, GEPA(basePtr, valIndices)); - } - - StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list &indices) - { - std::vector valIndices; - for (auto i : indices) - valIndices.push_back(i); - return STORE(val, GEPA(basePtr, valIndices)); - } - CallInst *Builder::CALL(Value *Callee, const std::initializer_list &argsList, const llvm::Twine& name) { std::vector args; @@ -418,34 +354,6 @@ namespace SwrJit return vOut; } - ////////////////////////////////////////////////////////////////////////// - /// @brief Generate an i32 masked load operation in LLVM IR. 
If not - /// supported on the underlying platform, emulate it with float masked load - /// @param src - base address pointer for the load - /// @param vMask - SIMD wide mask that controls whether to access memory load 0 - Value *Builder::MASKLOADD(Value* src,Value* mask) - { - Value* vResult; - // use avx2 gather instruction is available - if(JM()->mArch.AVX2()) - { - Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256); - vResult = CALL(func,{src,mask}); - } - else - { - // maskload intrinsic expects integer mask operand in llvm >= 3.8 - #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8) - mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth)); - #else - mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth)); - #endif - Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256); - vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth)); - } - return vResult; - } - ////////////////////////////////////////////////////////////////////////// /// @brief insert a JIT call to CallPrint /// - outputs formatted string to both stdout and VS output window @@ -581,222 +489,6 @@ namespace SwrJit return PRINT(printStr, {}); } - ////////////////////////////////////////////////////////////////////////// - /// @brief Generate a masked gather operation in LLVM IR. If not - /// supported on the underlying platform, emulate it with loads - /// @param vSrc - SIMD wide value that will be loaded if mask is invalid - /// @param pBase - Int8* base VB address pointer value - /// @param vIndices - SIMD wide value of VB byte offsets - /// @param vMask - SIMD wide mask that controls whether to access memory or the src values - /// @param scale - value to scale indices by - Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale, Value *pDrawContext) - { - Value *vGather; - - // use avx2 gather instruction if available - if(JM()->mArch.AVX2()) - { - // force mask to , required by vgather - Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty); - - vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale)); - } - else - { - Value* pStack = STACKSAVE(); - - // store vSrc on the stack. 
this way we can select between a valid load address and the vSrc address - Value* vSrcPtr = ALLOCA(vSrc->getType()); - STORE(vSrc, vSrcPtr); - - vGather = VUNDEF_F(); - Value *vScaleVec = VIMMED1((uint32_t)scale); - Value *vOffsets = MUL(vIndices,vScaleVec); - for(uint32_t i = 0; i < mVWidth; ++i) - { - // single component byte index - Value *offset = VEXTRACT(vOffsets,C(i)); - // byte pointer to component - Value *loadAddress = GEP(pBase,offset); - loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0)); - // pointer to the value to load if we're masking off a component - Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)}); - Value *selMask = VEXTRACT(vMask,C(i)); - // switch in a safe address to load if we're trying to access a vertex - Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); - Value *val = LOAD(validAddress); - vGather = VINSERT(vGather,val,C(i)); - } - - STACKRESTORE(pStack); - } - - return vGather; - } - - Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale) - { - Value *vGather = VUNDEF_F_16(); - - // use AVX512F gather instruction if available - if (JM()->mArch.AVX512F()) - { - // force mask to , required by vgather2 - Value *mask = BITCAST(vMask, mInt16Ty); - - vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale)); - } - else - { - Value *src0 = EXTRACT_16(vSrc, 0); - Value *src1 = EXTRACT_16(vSrc, 1); - - Value *indices0 = EXTRACT_16(vIndices, 0); - Value *indices1 = EXTRACT_16(vIndices, 1); - - Value *mask0 = EXTRACT_16(vMask, 0); - Value *mask1 = EXTRACT_16(vMask, 1); - - Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale); - Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale); - - vGather = JOIN_16(gather0, gather1); - } - - return vGather; - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Generate a masked gather operation in LLVM IR. If not - /// supported on the underlying platform, emulate it with loads - /// @param vSrc - SIMD wide value that will be loaded if mask is invalid - /// @param pBase - Int8* base VB address pointer value - /// @param vIndices - SIMD wide value of VB byte offsets - /// @param vMask - SIMD wide mask that controls whether to access memory or the src values - /// @param scale - value to scale indices by - Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale) - { - Value* vGather; - - // use avx2 gather instruction if available - if(JM()->mArch.AVX2()) - { - vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale)); - } - else - { - Value* pStack = STACKSAVE(); - - // store vSrc on the stack. 
this way we can select between a valid load address and the vSrc address - Value* vSrcPtr = ALLOCA(vSrc->getType()); - STORE(vSrc, vSrcPtr); - - vGather = VUNDEF_I(); - Value *vScaleVec = VIMMED1((uint32_t)scale); - Value *vOffsets = MUL(vIndices, vScaleVec); - for(uint32_t i = 0; i < mVWidth; ++i) - { - // single component byte index - Value *offset = VEXTRACT(vOffsets, C(i)); - // byte pointer to component - Value *loadAddress = GEP(pBase, offset); - loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0)); - // pointer to the value to load if we're masking off a component - Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)}); - Value *selMask = VEXTRACT(vMask, C(i)); - // switch in a safe address to load if we're trying to access a vertex - Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); - Value *val = LOAD(validAddress, C(0)); - vGather = VINSERT(vGather, val, C(i)); - } - - STACKRESTORE(pStack); - } - - return vGather; - } - - Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale) - { - Value *vGather = VUNDEF_I_16(); - - // use AVX512F gather instruction if available - if (JM()->mArch.AVX512F()) - { - // force mask to , required by vgather2 - Value *mask = BITCAST(vMask, mInt16Ty); - - vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale)); - } - else - { - Value *src0 = EXTRACT_16(vSrc, 0); - Value *src1 = EXTRACT_16(vSrc, 1); - - Value *indices0 = EXTRACT_16(vIndices, 0); - Value *indices1 = EXTRACT_16(vIndices, 1); - - Value *mask0 = EXTRACT_16(vMask, 0); - Value *mask1 = EXTRACT_16(vMask, 1); - - Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale); - Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale); - - vGather = JOIN_16(gather0, gather1); - } - - return vGather; - } - - ////////////////////////////////////////////////////////////////////////// - /// @brief Generate a masked gather operation in LLVM IR. If not - /// supported on the underlying platform, emulate it with loads - /// @param vSrc - SIMD wide value that will be loaded if mask is invalid - /// @param pBase - Int8* base VB address pointer value - /// @param vIndices - SIMD wide value of VB byte offsets - /// @param vMask - SIMD wide mask that controls whether to access memory or the src values - /// @param scale - value to scale indices by - Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale) - { - Value* vGather; - - // use avx2 gather instruction if available - if(JM()->mArch.AVX2()) - { - vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth/2)), VectorType::get(mDoubleTy, mVWidth/2)); - vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale)); - } - else - { - Value* pStack = STACKSAVE(); - - // store vSrc on the stack. 
this way we can select between a valid load address and the vSrc address - Value* vSrcPtr = ALLOCA(vSrc->getType()); - STORE(vSrc, vSrcPtr); - - vGather = UndefValue::get(VectorType::get(mDoubleTy, 4)); - Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale)); - Value *vOffsets = MUL(vIndices,vScaleVec); - for(uint32_t i = 0; i < mVWidth/2; ++i) - { - // single component byte index - Value *offset = VEXTRACT(vOffsets,C(i)); - // byte pointer to component - Value *loadAddress = GEP(pBase,offset); - loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0)); - // pointer to the value to load if we're masking off a component - Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)}); - Value *selMask = VEXTRACT(vMask,C(i)); - // switch in a safe address to load if we're trying to access a vertex - Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); - Value *val = LOAD(validAddress); - vGather = VINSERT(vGather,val,C(i)); - } - STACKRESTORE(pStack); - } - return vGather; - } - Value *Builder::EXTRACT_16(Value *x, uint32_t imm) { if (imm == 0) @@ -1064,360 +756,6 @@ namespace SwrJit return SELECT(cmp, a, b); } - void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, - Value* mask, Value* vGatherComponents[], bool bPackedOutput) - { - const SWR_FORMAT_INFO &info = GetFormatInfo(format); - if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32) - { - GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput); - } - else - { - GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput); - } - } - - void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, - Value* vMask, Value* vGatherComponents[], bool bPackedOutput) - { - switch(info.bpp / info.numComps) - { - case 16: - { - Value* vGatherResult[2]; - - // TODO: vGatherMaskedVal - Value* vGatherMaskedVal = VIMMED1((float)0); - - // always have at least one component out of x or y to fetch - - vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); - // e.g. result of first 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy - // - - // if we have at least one component out of x or y to fetch - if(info.numComps > 2) - { - // offset base to the next components(zw) in the vertex to gather - pSrcBase = GEP(pSrcBase, C((char)4)); - - vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); - // e.g. 
result of second 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw - // - } - else - { - vGatherResult[1] = vGatherMaskedVal; - } - - // Shuffle gathered components into place, each row is a component - Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); - } - break; - case 32: - { - // apply defaults - for (uint32_t i = 0; i < 4; ++i) - { - vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]); - } - - for(uint32_t i = 0; i < info.numComps; i++) - { - uint32_t swizzleIndex = info.swizzle[i]; - - // Gather a SIMD of components - vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask); - - // offset base to the next component to gather - pSrcBase = GEP(pSrcBase, C((char)4)); - } - } - break; - default: - SWR_INVALID("Invalid float format"); - break; - } - } - - void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, - Value* vMask, Value* vGatherComponents[], bool bPackedOutput) - { - switch (info.bpp / info.numComps) - { - case 8: - { - Value* vGatherMaskedVal = VIMMED1((int32_t)0); - Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); - // e.g. result of an 8x32bit integer gather for 8bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw - - Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); - } - break; - case 16: - { - Value* vGatherResult[2]; - - // TODO: vGatherMaskedVal - Value* vGatherMaskedVal = VIMMED1((int32_t)0); - - // always have at least one component out of x or y to fetch - - vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); - // e.g. result of first 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy - // - - // if we have at least one component out of x or y to fetch - if(info.numComps > 2) - { - // offset base to the next components(zw) in the vertex to gather - pSrcBase = GEP(pSrcBase, C((char)4)); - - vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); - // e.g. 
result of second 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw - // - } - else - { - vGatherResult[1] = vGatherMaskedVal; - } - - // Shuffle gathered components into place, each row is a component - Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); - - } - break; - case 32: - { - // apply defaults - for (uint32_t i = 0; i < 4; ++i) - { - vGatherComponents[i] = VIMMED1((int)info.defaults[i]); - } - - for(uint32_t i = 0; i < info.numComps; i++) - { - uint32_t swizzleIndex = info.swizzle[i]; - - // Gather a SIMD of components - vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask); - - // offset base to the next component to gather - pSrcBase = GEP(pSrcBase, C((char)4)); - } - } - break; - default: - SWR_INVALID("unsupported format"); - break; - } - } - - void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput) - { - // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits - - // input could either be float or int vector; do shuffle work in int - vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty); - vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty); - - if(bPackedOutput) - { - Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits - - // shuffle mask - Value* vConstMask = C({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); - Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy); - // after pshufb: group components together in each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy - - Value* vi128XY = BITCAST(PERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); - // after PERMD: move and pack xy components into each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy - - // do the same for zw components - Value* vi128ZW = nullptr; - if(info.numComps > 2) - { - Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy); - vi128ZW = BITCAST(PERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); - } - - for(uint32_t i = 0; i < 4; i++) - { - uint32_t swizzleIndex = info.swizzle[i]; - // todo: fixed for packed - Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); - if(i >= info.numComps) - { - // set the default component val - vGatherOutput[swizzleIndex] = vGatherMaskedVal; - continue; - } - - // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 - uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; - // if x or y, use vi128XY permute result, else use vi128ZW - Value* selectedPermute = (i < 2) ? 
vi128XY : vi128ZW; - - // extract packed component 128 bit lanes - vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); - } - - } - else - { - // pshufb masks for each component - Value* vConstMask[2]; - // x/z shuffle mask - vConstMask[0] = C({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, - 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); - - // y/w shuffle mask - vConstMask[1] = C({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, - 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); - - - // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits - // apply defaults - for (uint32_t i = 0; i < 4; ++i) - { - vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); - } - - for(uint32_t i = 0; i < info.numComps; i++) - { - uint32_t swizzleIndex = info.swizzle[i]; - - // select correct constMask for x/z or y/w pshufb - uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; - // if x or y, use vi128XY permute result, else use vi128ZW - uint32_t selectedGather = (i < 2) ? 0 : 1; - - vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); - // after pshufb mask for x channel; z uses the same shuffle from the second gather - // 256i - 0 1 2 3 4 5 6 7 - // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 - } - } - } - - void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput) - { - // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits - - if(bPackedOutput) - { - Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits - // shuffle mask - Value* vConstMask = C({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, - 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}); - Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); - // after pshufb: group components together in each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww - - Value* vi128XY = BITCAST(PERMD(vShufResult, C({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty); - // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care) - - // do the same for zw components - Value* vi128ZW = nullptr; - if(info.numComps > 2) - { - vi128ZW = BITCAST(PERMD(vShufResult, C({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty); - } - - // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex - for(uint32_t i = 0; i < 4; i++) - { - uint32_t swizzleIndex = info.swizzle[i]; - // todo: fix for packed - Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); - if(i >= info.numComps) - { - // set the default component val - vGatherOutput[swizzleIndex] = vGatherMaskedVal; - continue; - } - - // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 - uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; - // if x or y, use vi128XY permute result, else use vi128ZW - Value* selectedPermute = (i < 2) ? 
vi128XY : vi128ZW; - - // sign extend - vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); - } - } - // else zero extend - else{ - // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits - // apply defaults - for (uint32_t i = 0; i < 4; ++i) - { - vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); - } - - for(uint32_t i = 0; i < info.numComps; i++){ - uint32_t swizzleIndex = info.swizzle[i]; - - // pshufb masks for each component - Value* vConstMask; - switch(i) - { - case 0: - // x shuffle mask - vConstMask = C({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, - 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1}); - break; - case 1: - // y shuffle mask - vConstMask = C({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, - 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1}); - break; - case 2: - // z shuffle mask - vConstMask = C({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, - 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1}); - break; - case 3: - // w shuffle mask - vConstMask = C({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, - 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1}); - break; - default: - vConstMask = nullptr; - break; - } - - vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); - // after pshufb for x channel - // 256i - 0 1 2 3 4 5 6 7 - // x000 x000 x000 x000 x000 x000 x000 x000 - } - } - } - // Helper function to create alloca in entry block of function Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType) { @@ -1439,105 +777,6 @@ namespace SwrJit return pAlloca; } - ////////////////////////////////////////////////////////////////////////// - /// @brief emulates a scatter operation. - /// @param pDst - pointer to destination - /// @param vSrc - vector of src data to scatter - /// @param vOffsets - vector of byte offsets from pDst - /// @param vMask - mask of valid lanes - void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask) - { - /* Scatter algorithm - - while(Index = BitScanForward(mask)) - srcElem = srcVector[Index] - offsetElem = offsetVector[Index] - *(pDst + offsetElem) = srcElem - Update mask (&= ~(1<GetInsertBlock(); - Function* pFunc = pCurBB->getParent(); - Type* pSrcTy = vSrc->getType()->getVectorElementType(); - - // Store vectors on stack - if (pScatterStackSrc == nullptr) - { - // Save off stack allocations and reuse per scatter. Significantly reduces stack - // requirements for shaders with a lot of scatters. 
-            pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
-            pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
-        }
-
-        Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
-        Value* pOffsetsArrayPtr = pScatterStackOffsets;
-        STORE(vSrc, pSrcArrayPtr);
-        STORE(vOffsets, pOffsetsArrayPtr);
-
-        // Cast to pointers for random access
-        pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
-        pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
-
-        Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
-
-        // Get cttz function
-        Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
-
-        // Setup loop basic block
-        BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);
-
-        // compute first set bit
-        Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
-
-        Value* pIsUndef = ICMP_EQ(pIndex, C(32));
-
-        // Split current block
-        BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
-
-        // Remove unconditional jump created by splitBasicBlock
-        pCurBB->getTerminator()->eraseFromParent();
-
-        // Add terminator to end of original block
-        IRB()->SetInsertPoint(pCurBB);
-
-        // Add conditional branch
-        COND_BR(pIsUndef, pPostLoop, pLoop);
-
-        // Add loop basic block contents
-        IRB()->SetInsertPoint(pLoop);
-        PHINode* pIndexPhi = PHI(mInt32Ty, 2);
-        PHINode* pMaskPhi = PHI(mInt32Ty, 2);
-
-        pIndexPhi->addIncoming(pIndex, pCurBB);
-        pMaskPhi->addIncoming(pMask, pCurBB);
-
-        // Extract elements for this index
-        Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
-        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
-
-        // GEP to this offset in dst
-        Value* pCurDst = GEP(pDst, pOffsetElem);
-        pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
-        STORE(pSrcElem, pCurDst);
-
-        // Update the mask
-        Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
-
-        // Terminator
-        Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
-
-        pIsUndef = ICMP_EQ(pNewIndex, C(32));
-        COND_BR(pIsUndef, pPostLoop, pLoop);
-
-        // Update phi edges
-        pIndexPhi->addIncoming(pNewIndex, pLoop);
-        pMaskPhi->addIncoming(pNewMask, pLoop);
-
-        // Move builder to beginning of post loop
-        IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
-    }
-
     Value* Builder::VABSPS(Value* a)
     {
         Value* asInt = BITCAST(a, mSimdInt32Ty);
@@ -1575,21 +814,6 @@ namespace SwrJit
         return result;
     }
 
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief save/restore stack, providing ability to push/pop the stack and
-    /// reduce overall stack requirements for temporary stack use
-    Value* Builder::STACKSAVE()
-    {
-        Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
-        return CALLA(pfnStackSave);
-    }
-
-    void Builder::STACKRESTORE(Value* pSaved)
-    {
-        Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
-        CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
-    }
-
     Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
     {
         Value* vOut;
@@ -1707,7 +931,6 @@ namespace SwrJit
         }
     }
 
-
     uint32_t Builder::GetTypeSize(Type* pType)
     {
         if (pType->isStructTy())
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 891b31d3201..50d7a1e71fa 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -90,22 +90,12 @@ Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY);
 uint32_t IMMED(Value* i);
 int32_t S_IMMED(Value* i);
 
-Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
-Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
-Value *IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
-Value *IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
-
 CallInst *CALL(Value *Callee, const std::initializer_list<Value*> &args, const llvm::Twine& name = "");
 CallInst *CALL(Value *Callee) { return CALLA(Callee); }
 CallInst *CALL(Value *Callee, Value* arg);
 CallInst *CALL2(Value *Callee, Value* arg1, Value* arg2);
 CallInst *CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3);
 
-LoadInst *LOAD(Value *BasePtr, const std::initializer_list<uint32_t> &offset, const llvm::Twine& name = "");
-LoadInst *LOADV(Value *BasePtr, const std::initializer_list<Value*> &offset, const llvm::Twine& name = "");
-StoreInst *STORE(Value *Val, Value *BasePtr, const std::initializer_list<uint32_t> &offset);
-StoreInst *STOREV(Value *Val, Value *BasePtr, const std::initializer_list<Value*> &offset);
-
 Value *VCMPPS_EQ(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_EQ_OQ)); }
 Value *VCMPPS_LT(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_LT_OQ)); }
 Value *VCMPPS_LE(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_LE_OQ)); }
@@ -129,30 +119,6 @@ Value *VMASK_16(Value *mask);
 Value *EXTRACT_16(Value *x, uint32_t imm);
 Value *JOIN_16(Value *a, Value *b);
 
-Value *MASKLOADD(Value* src, Value* mask);
-
-void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
-    Value* mask, Value* vGatherComponents[], bool bPackedOutput);
-
-virtual Value *GATHERPS(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1, Value *pDrawContext = nullptr);
-Value *GATHERPS_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1);
-
-void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
-    Value* mask, Value* vGatherComponents[], bool bPackedOutput);
-
-Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1);
-Value *GATHERDD_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1);
-
-void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
-    Value* mask, Value* vGatherComponents[], bool bPackedOutput);
-
-Value *GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1);
-
-void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask);
-
-void Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput);
-void Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[], Value* vGatherOutput[], bool bPackedOutput);
-
 Value *PSHUFB(Value* a, Value* b);
 Value *PMOVSXBD(Value* a);
 Value *PMOVSXWD(Value* a);
@@ -180,8 +146,6 @@ Value *FCLAMP(Value* src, float low, float high);
 CallInst *PRINT(const std::string &printStr);
 CallInst *PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs);
-Value* STACKSAVE();
-void STACKRESTORE(Value* pSaved);
-
 Value* POPCNT(Value* a);
 
@@ -199,9 +163,4 @@ void RDTSC_STOP(Value* pBucketMgr, Value* pId);
 
 Value* CreateEntryAlloca(Function* pFunc, Type* pType);
 Value* CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize);
-// Static stack allocations for scatter operations
-Value* pScatterStackSrc{ nullptr };
-Value* pScatterStackOffsets{ nullptr };
-
-
 uint32_t GetTypeSize(Type* pType);