/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file builder_misc.cpp
-*
-* @brief Implementation for miscellaneous builder functions
-*
-* Notes:
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file builder_misc.cpp
+ *
+ * @brief Implementation for miscellaneous builder functions
+ *
+ * Notes:
+ *
+ ******************************************************************************/
+#include "jit_pch.hpp"
#include "builder.h"
#include "common/rdtsc_buckets.h"
#include <cstdarg>
+extern "C" void CallPrint(const char* fmt, ...);
+
namespace SwrJit
{
- void __cdecl CallPrint(const char* fmt, ...);
-
//////////////////////////////////////////////////////////////////////////
    /// @brief Convert an IEEE 754 32-bit single precision float to an
    ///        16 bit float with 5 exponent bits and a variable
    ///        mantissa.
    /// @param val - 32-bit float
    /// @todo Maybe move this outside of this file into a header?
    static uint16_t ConvertFloat32ToFloat16(float val)
    {
        uint32_t sign, exp, mant;
        uint32_t roundBits;

// Extract the sign, exponent, and mantissa
uint32_t uf = *(uint32_t*)&val;
- sign = (uf & 0x80000000) >> 31;
- exp = (uf & 0x7F800000) >> 23;
- mant = uf & 0x007FFFFF;
+ sign = (uf & 0x80000000) >> 31;
+ exp = (uf & 0x7F800000) >> 23;
+ mant = uf & 0x007FFFFF;
// Check for out of range
if (std::isnan(val))
{
- exp = 0x1F;
+ exp = 0x1F;
mant = 0x200;
- sign = 1; // set the sign bit for NANs
+            sign = 1; // set the sign bit for NaNs
}
else if (std::isinf(val))
{
- exp = 0x1f;
+ exp = 0x1f;
mant = 0x0;
}
else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
{
- exp = 0x1E;
+ exp = 0x1E;
mant = 0x3FF;
}
        else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
        {
mant |= 0x00800000;
for (; exp <= 0x70; mant >>= 1, exp++)
;
- exp = 0;
+ exp = 0;
mant = mant >> 13;
}
else if (exp < 0x66) // Too small to represent -> Zero
{
- exp = 0;
+ exp = 0;
mant = 0;
}
        else
        {
// Saves bits that will be shifted off for rounding
roundBits = mant & 0x1FFFu;
// convert exponent and mantissa to 16 bit format
- exp = exp - 0x70;
+ exp = exp - 0x70;
mant = mant >> 13;
// Essentially RTZ, but round up if off by only 1 lsb
            if (roundBits == 0x1FFFu)
            {
                mant++;
                // check for overflow into the exponent
                if ((mant & 0xC00) != 0)
                    exp++;
                // keep only the 10 mantissa bits
                mant &= 0x3FF;
            }
        }

        uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
return (uint16_t)tmpVal;
}
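
    // Worked example (illustrative): 1.0f carries float exponent 0x7F and a
    // zero mantissa, so the half encoding is exp = 0x7F - 0x70 = 0xF, mant = 0,
    // i.e. 0x3C00; likewise -2.0f (exponent 0x80, sign 1) encodes as 0xC000:
    //   assert(ConvertFloat32ToFloat16(1.0f)  == 0x3C00);
    //   assert(ConvertFloat32ToFloat16(-2.0f) == 0xC000);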
- //////////////////////////////////////////////////////////////////////////
- /// @brief Convert an IEEE 754 16-bit float to an 32-bit single precision
- /// float
- /// @param val - 16-bit float
- /// @todo Maybe move this outside of this file into a header?
- static float ConvertFloat16ToFloat32(uint32_t val)
- {
- uint32_t result;
- if ((val & 0x7fff) == 0)
- {
- result = ((uint32_t)(val & 0x8000)) << 16;
- }
- else if ((val & 0x7c00) == 0x7c00)
- {
- result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
- result |= ((uint32_t)val & 0x8000) << 16;
- }
- else
- {
- uint32_t sign = (val & 0x8000) << 16;
- uint32_t mant = (val & 0x3ff) << 13;
- uint32_t exp = (val >> 10) & 0x1f;
- if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
- {
- mant <<= 1;
- while (mant < (0x400 << 13))
- {
- exp--;
- mant <<= 1;
- }
- mant &= (0x3ff << 13);
- }
- exp = ((exp - 15 + 127) & 0xff) << 23;
- result = sign | exp | mant;
- }
+ Constant* Builder::C(bool i) { return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0)); }
- return *(float*)&result;
- }
+ Constant* Builder::C(char i) { return ConstantInt::get(IRB()->getInt8Ty(), i); }
- Constant *Builder::C(bool i)
- {
- return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
- }
+ Constant* Builder::C(uint8_t i) { return ConstantInt::get(IRB()->getInt8Ty(), i); }
- Constant *Builder::C(char i)
- {
- return ConstantInt::get(IRB()->getInt8Ty(), i);
- }
+ Constant* Builder::C(int i) { return ConstantInt::get(IRB()->getInt32Ty(), i); }
- Constant *Builder::C(uint8_t i)
- {
- return ConstantInt::get(IRB()->getInt8Ty(), i);
- }
+ Constant* Builder::C(int64_t i) { return ConstantInt::get(IRB()->getInt64Ty(), i); }
- Constant *Builder::C(int i)
- {
- return ConstantInt::get(IRB()->getInt32Ty(), i);
- }
+ Constant* Builder::C(uint16_t i) { return ConstantInt::get(mInt16Ty, i); }
+
+ Constant* Builder::C(uint32_t i) { return ConstantInt::get(IRB()->getInt32Ty(), i); }
+
+ Constant* Builder::C(uint64_t i) { return ConstantInt::get(IRB()->getInt64Ty(), i); }
+
+ Constant* Builder::C(float i) { return ConstantFP::get(IRB()->getFloatTy(), i); }
- Constant *Builder::C(int64_t i)
+ Constant* Builder::PRED(bool pred)
{
- return ConstantInt::get(IRB()->getInt64Ty(), i);
+ return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
}
- Constant *Builder::C(uint16_t i)
+ Value* Builder::VIMMED1(uint64_t i)
{
- return ConstantInt::get(mInt16Ty,i);
+#if LLVM_VERSION_MAJOR > 10
+ return ConstantVector::getSplat(ElementCount(mVWidth, false), cast<ConstantInt>(C(i)));
+#else
+ return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
+#endif
}
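
    // The LLVM_VERSION_MAJOR > 10 guards used throughout these splat helpers
    // track an upstream API change: from LLVM 11, ConstantVector::getSplat()
    // takes an ElementCount (lane count plus a scalable-vector flag) instead
    // of a plain lane count. Sketch of the two spellings for an 8-wide splat:
    //   ConstantVector::getSplat(8, elt);                      // LLVM <= 10
    //   ConstantVector::getSplat(ElementCount(8, false), elt); // LLVM 11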
- Constant *Builder::C(uint32_t i)
+ Value* Builder::VIMMED1_16(uint64_t i)
{
- return ConstantInt::get(IRB()->getInt32Ty(), i);
+#if LLVM_VERSION_MAJOR > 10
+ return ConstantVector::getSplat(ElementCount(mVWidth16, false), cast<ConstantInt>(C(i)));
+#else
+ return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
+#endif
}
- Constant *Builder::C(float i)
+ Value* Builder::VIMMED1(int i)
{
- return ConstantFP::get(IRB()->getFloatTy(), i);
+#if LLVM_VERSION_MAJOR > 10
+ return ConstantVector::getSplat(ElementCount(mVWidth, false), cast<ConstantInt>(C(i)));
+#else
+ return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
+#endif
}
- Constant *Builder::PRED(bool pred)
+ Value* Builder::VIMMED1_16(int i)
{
- return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
+#if LLVM_VERSION_MAJOR > 10
+ return ConstantVector::getSplat(ElementCount(mVWidth16, false), cast<ConstantInt>(C(i)));
+#else
+ return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
+#endif
}
- Value *Builder::VIMMED1(int i)
+ Value* Builder::VIMMED1(uint32_t i)
{
+#if LLVM_VERSION_MAJOR > 10
+ return ConstantVector::getSplat(ElementCount(mVWidth, false), cast<ConstantInt>(C(i)));
+#else
return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
+#endif
}
- Value *Builder::VIMMED1(uint32_t i)
+ Value* Builder::VIMMED1_16(uint32_t i)
{
- return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
+#if LLVM_VERSION_MAJOR > 10
+ return ConstantVector::getSplat(ElementCount(mVWidth16, false), cast<ConstantInt>(C(i)));
+#else
+ return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
+#endif
}
- Value *Builder::VIMMED1(float i)
+ Value* Builder::VIMMED1(float i)
{
+#if LLVM_VERSION_MAJOR > 10
+ return ConstantVector::getSplat(ElementCount(mVWidth, false), cast<ConstantFP>(C(i)));
+#else
return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
+#endif
}
- Value *Builder::VIMMED1(bool i)
+ Value* Builder::VIMMED1_16(float i)
{
- return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
+#if LLVM_VERSION_MAJOR > 10
+ return ConstantVector::getSplat(ElementCount(mVWidth16, false), cast<ConstantFP>(C(i)));
+#else
+ return ConstantVector::getSplat(mVWidth16, cast<ConstantFP>(C(i)));
+#endif
}
- Value *Builder::VUNDEF_IPTR()
+ Value* Builder::VIMMED1(bool i)
{
- return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
+#if LLVM_VERSION_MAJOR > 10
+ return ConstantVector::getSplat(ElementCount(mVWidth, false), cast<ConstantInt>(C(i)));
+#else
+ return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
+#endif
}
- Value *Builder::VUNDEF_I()
+ Value* Builder::VIMMED1_16(bool i)
{
- return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
+#if LLVM_VERSION_MAJOR > 10
+ return ConstantVector::getSplat(ElementCount(mVWidth16, false), cast<ConstantInt>(C(i)));
+#else
+ return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
+#endif
}
- Value *Builder::VUNDEF(Type *ty, uint32_t size)
- {
- return UndefValue::get(VectorType::get(ty, size));
- }
+ Value* Builder::VUNDEF_IPTR() { return UndefValue::get(VectorType::get(mInt32PtrTy, mVWidth)); }
- Value *Builder::VUNDEF_F()
- {
- return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
- }
+ Value* Builder::VUNDEF(Type* t) { return UndefValue::get(VectorType::get(t, mVWidth)); }
+
+ Value* Builder::VUNDEF_I() { return UndefValue::get(VectorType::get(mInt32Ty, mVWidth)); }
+
+ Value* Builder::VUNDEF_I_16() { return UndefValue::get(VectorType::get(mInt32Ty, mVWidth16)); }
+
+ Value* Builder::VUNDEF_F() { return UndefValue::get(VectorType::get(mFP32Ty, mVWidth)); }
-#if USE_SIMD16_BUILDER
- Value *Builder::VUNDEF2_F()
+ Value* Builder::VUNDEF_F_16() { return UndefValue::get(VectorType::get(mFP32Ty, mVWidth16)); }
+
+ Value* Builder::VUNDEF(Type* ty, uint32_t size)
{
- return UndefValue::get(VectorType::get(mFP32Ty, mVWidth2));
+ return UndefValue::get(VectorType::get(ty, size));
}
-#endif
- Value *Builder::VUNDEF(Type* t)
+ Value* Builder::VBROADCAST(Value* src, const llvm::Twine& name)
{
- return UndefValue::get(VectorType::get(t, mVWidth));
+ // check if src is already a vector
+ if (src->getType()->isVectorTy())
+ {
+ return src;
+ }
+
+ return VECTOR_SPLAT(mVWidth, src, name);
}
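
    // Usage sketch (illustrative): splatting a scalar produces a uniform SIMD
    // value; pScalar here is a hypothetical i32 already loaded by the caller:
    //   Value* vSplat = VBROADCAST(pScalar, "uniformVal"); // <mVWidth x i32>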
- Value *Builder::VBROADCAST(Value *src)
+ Value* Builder::VBROADCAST_16(Value* src)
{
// check if src is already a vector
        if (src->getType()->isVectorTy())
        {
            return src;
        }
- return VECTOR_SPLAT(mVWidth, src);
+ return VECTOR_SPLAT(mVWidth16, src);
}
uint32_t Builder::IMMED(Value* v)
{
SWR_ASSERT(isa<ConstantInt>(v));
- ConstantInt *pValConst = cast<ConstantInt>(v);
+ ConstantInt* pValConst = cast<ConstantInt>(v);
return pValConst->getZExtValue();
}
int32_t Builder::S_IMMED(Value* v)
{
SWR_ASSERT(isa<ConstantInt>(v));
- ConstantInt *pValConst = cast<ConstantInt>(v);
+ ConstantInt* pValConst = cast<ConstantInt>(v);
return pValConst->getSExtValue();
}
- Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
- {
- std::vector<Value*> indices;
- for (auto i : indexList)
- indices.push_back(i);
- return GEPA(ptr, indices);
- }
-
- Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
- {
- std::vector<Value*> indices;
- for (auto i : indexList)
- indices.push_back(C(i));
- return GEPA(ptr, indices);
- }
-
- Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
- {
- std::vector<Value*> indices;
- for (auto i : indexList)
- indices.push_back(i);
- return IN_BOUNDS_GEP(ptr, indices);
- }
-
- Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
- {
- std::vector<Value*> indices;
- for (auto i : indexList)
- indices.push_back(C(i));
- return IN_BOUNDS_GEP(ptr, indices);
- }
-
- LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
- {
- std::vector<Value*> valIndices;
- for (auto i : indices)
- valIndices.push_back(C(i));
- return LOAD(GEPA(basePtr, valIndices), name);
- }
-
- LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
- {
- std::vector<Value*> valIndices;
- for (auto i : indices)
- valIndices.push_back(i);
- return LOAD(GEPA(basePtr, valIndices), name);
- }
-
- StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
- {
- std::vector<Value*> valIndices;
- for (auto i : indices)
- valIndices.push_back(C(i));
- return STORE(val, GEPA(basePtr, valIndices));
- }
-
- StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
- {
- std::vector<Value*> valIndices;
- for (auto i : indices)
- valIndices.push_back(i);
- return STORE(val, GEPA(basePtr, valIndices));
- }
-
- CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
+ CallInst* Builder::CALL(Value* Callee,
+ const std::initializer_list<Value*>& argsList,
+ const llvm::Twine& name)
{
std::vector<Value*> args;
for (auto arg : argsList)
args.push_back(arg);
- return CALLA(Callee, args);
+#if LLVM_VERSION_MAJOR >= 11
+ // see comment to CALLA(Callee) function in the header
+ return CALLA(FunctionCallee(cast<Function>(Callee)), args, name);
+#else
+ return CALLA(Callee, args, name);
+#endif
}
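
    // The LLVM_VERSION_MAJOR >= 11 paths in these CALL wrappers reflect the
    // removal of the raw Value* callee overloads of IRBuilder::CreateCall:
    // the callee must now be rewrapped as a FunctionCallee (function type
    // plus callee value). Hypothetical usage is unchanged for callers:
    //   CallInst* pCall = CALL(pFunc, {pArg0, pArg1}, "result");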
- CallInst *Builder::CALL(Value *Callee, Value* arg)
+ CallInst* Builder::CALL(Value* Callee, Value* arg)
{
std::vector<Value*> args;
args.push_back(arg);
+#if LLVM_VERSION_MAJOR >= 11
+ // see comment to CALLA(Callee) function in the header
+ return CALLA(FunctionCallee(cast<Function>(Callee)), args);
+#else
return CALLA(Callee, args);
+#endif
}
- CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
+ CallInst* Builder::CALL2(Value* Callee, Value* arg1, Value* arg2)
{
std::vector<Value*> args;
args.push_back(arg1);
args.push_back(arg2);
+#if LLVM_VERSION_MAJOR >= 11
+ // see comment to CALLA(Callee) function in the header
+ return CALLA(FunctionCallee(cast<Function>(Callee)), args);
+#else
return CALLA(Callee, args);
+#endif
}
- CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
+ CallInst* Builder::CALL3(Value* Callee, Value* arg1, Value* arg2, Value* arg3)
{
std::vector<Value*> args;
args.push_back(arg1);
args.push_back(arg2);
args.push_back(arg3);
+#if LLVM_VERSION_MAJOR >= 11
+ // see comment to CALLA(Callee) function in the header
+ return CALLA(FunctionCallee(cast<Function>(Callee)), args);
+#else
return CALLA(Callee, args);
+#endif
}
- //////////////////////////////////////////////////////////////////////////
- Value *Builder::DEBUGTRAP()
- {
- Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
- return CALL(func);
- }
-
- Value *Builder::VRCP(Value *va)
+ Value* Builder::VRCP(Value* va, const llvm::Twine& name)
{
- return FDIV(VIMMED1(1.0f), va); // 1 / a
+ return FDIV(VIMMED1(1.0f), va, name); // 1 / a
}
- Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
+ Value* Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value*& vX, Value*& vY)
{
Value* vOut = FMADDPS(vA, vX, vC);
- vOut = FMADDPS(vB, vY, vOut);
+ vOut = FMADDPS(vB, vY, vOut);
return vOut;
}
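
    // Sketch: two chained fmuladds evaluate the plane equation
    // out = A * x + B * y + C one SIMD lane at a time, e.g. when interpolating
    // a vertex attribute across a triangle from its i/j coefficients.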
- //////////////////////////////////////////////////////////////////////////
- /// @brief Generate an i32 masked load operation in LLVM IR. If not
- /// supported on the underlying platform, emulate it with float masked load
- /// @param src - base address pointer for the load
- /// @param vMask - SIMD wide mask that controls whether to access memory load 0
- Value *Builder::MASKLOADD(Value* src,Value* mask)
- {
- Value* vResult;
- // use avx2 gather instruction is available
- if(JM()->mArch.AVX2())
- {
- Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
- vResult = CALL(func,{src,mask});
- }
- else
- {
- // maskload intrinsic expects integer mask operand in llvm >= 3.8
- #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8)
- mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
- #else
- mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
- #endif
- Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
- vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth));
- }
- return vResult;
- }
-
//////////////////////////////////////////////////////////////////////////
/// @brief insert a JIT call to CallPrint
/// - outputs formatted string to both stdout and VS output window
/// result from a GEP, printing out the pointer to memory
/// @param printStr - constant string to print, which includes format specifiers
/// @param printArgs - initializer list of Value*'s to print to std out
- CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
+ CallInst* Builder::PRINT(const std::string& printStr,
+ const std::initializer_list<Value*>& printArgs)
{
// push the arguments to CallPrint into a vector
std::vector<Value*> printCallArgs;
printCallArgs.resize(1);
// search through the format string for special processing
- size_t pos = 0;
+ size_t pos = 0;
std::string tempStr(printStr);
- pos = tempStr.find('%', pos);
+ pos = tempStr.find('%', pos);
auto v = printArgs.begin();
while ((pos != std::string::npos) && (v != printArgs.end()))
{
- Value* pArg = *v;
- Type* pType = pArg->getType();
+ Value* pArg = *v;
+ Type* pType = pArg->getType();
if (pType->isVectorTy())
{
Type* pContainedType = pType->getContainedType(0);
-
+#if LLVM_VERSION_MAJOR >= 11
+ VectorType* pVectorType = cast<VectorType>(pType);
+#endif
if (toupper(tempStr[pos + 1]) == 'X')
{
- tempStr[pos] = '0';
+ tempStr[pos] = '0';
tempStr[pos + 1] = 'x';
tempStr.insert(pos + 2, "%08X ");
pos += 7;
printCallArgs.push_back(VEXTRACT(pArg, C(0)));
std::string vectorFormatStr;
+#if LLVM_VERSION_MAJOR >= 11
+ for (uint32_t i = 1; i < pVectorType->getNumElements(); ++i)
+#else
for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
+#endif
{
vectorFormatStr += "0x%08X ";
printCallArgs.push_back(VEXTRACT(pArg, C(i)));
else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
{
uint32_t i = 0;
- for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
+#if LLVM_VERSION_MAJOR >= 11
+ for (; i < pVectorType->getNumElements() - 1; i++)
+#else
+ for (; i < pType->getVectorNumElements() - 1; i++)
+#endif
{
tempStr.insert(pos, std::string("%f "));
pos += 3;
- printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
+ printCallArgs.push_back(
+ FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
}
- printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
+ printCallArgs.push_back(
+ FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
}
else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
{
uint32_t i = 0;
- for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
+#if LLVM_VERSION_MAJOR >= 11
+ for (; i < pVectorType->getNumElements() - 1; i++)
+#else
+ for (; i < pType->getVectorNumElements() - 1; i++)
+#endif
{
tempStr.insert(pos, std::string("%d "));
pos += 3;
- printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+ printCallArgs.push_back(
+ S_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
}
- printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+ printCallArgs.push_back(
+ S_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
+ }
+ else if ((tempStr[pos + 1] == 'u') && (pContainedType->isIntegerTy()))
+ {
+ uint32_t i = 0;
+#if LLVM_VERSION_MAJOR >= 11
+ for (; i < pVectorType->getNumElements() - 1; i++)
+#else
+ for (; i < pType->getVectorNumElements() - 1; i++)
+#endif
+ {
+                    tempStr.insert(pos, std::string("%u "));
+ pos += 3;
+ printCallArgs.push_back(
+ Z_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
+ }
+ printCallArgs.push_back(
+ Z_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
}
}
else
}
// create global variable constant string
- Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
- GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
+ Constant* constString = ConstantDataArray::getString(JM()->mContext, tempStr, true);
+ GlobalVariable* gvPtr = new GlobalVariable(
+ constString->getType(), true, GlobalValue::InternalLinkage, constString, "printStr");
JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
// get a pointer to the first character in the constant string array
- std::vector<Constant*> geplist{C(0),C(0)};
- Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
+ std::vector<Constant*> geplist{C(0), C(0)};
+ Constant* strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr, geplist, false);
// insert the pointer to the format string in the argument vector
printCallArgs[0] = strGEP;
// get pointer to CallPrint function and insert decl into the module if needed
std::vector<Type*> args;
- args.push_back(PointerType::get(mInt8Ty,0));
- FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
- Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
+ args.push_back(PointerType::get(mInt8Ty, 0));
+ FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, true);
+ Function* callPrintFn =
+#if LLVM_VERSION_MAJOR >= 9
+ cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy).getCallee());
+#else
+ cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
+#endif
// if we haven't yet added the symbol to the symbol table
- if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
+ if ((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
{
- sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
+ sys::DynamicLibrary::AddSymbol("CallPrint", (void*)&CallPrint);
}
// insert a call to CallPrint
- return CALLA(callPrintFn,printCallArgs);
+ return CALLA(callPrintFn, printCallArgs);
}
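
    // Usage sketch (illustrative; pCount and vPos are hypothetical values):
    // scalar args pass straight through, while vector args are expanded one
    // lane per specifier by the loop above:
    //   PRINT("count = %d, pos = %f\n", {pCount, vPos});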
//////////////////////////////////////////////////////////////////////////
/// @brief Wrapper around PRINT with initializer list.
- CallInst* Builder::PRINT(const std::string &printStr)
- {
- return PRINT(printStr, {});
- }
+ CallInst* Builder::PRINT(const std::string& printStr) { return PRINT(printStr, {}); }
- //////////////////////////////////////////////////////////////////////////
- /// @brief Generate a masked gather operation in LLVM IR. If not
- /// supported on the underlying platform, emulate it with loads
- /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
- /// @param pBase - Int8* base VB address pointer value
- /// @param vIndices - SIMD wide value of VB byte offsets
- /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
- /// @param scale - value to scale indices by
- Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
+ Value* Builder::EXTRACT_16(Value* x, uint32_t imm)
{
- Value* vGather;
-
- // use avx2 gather instruction if available
- if(JM()->mArch.AVX2())
+ if (imm == 0)
{
- // force mask to <N x float>, required by vgather
- vMask = BITCAST(vMask, mSimdFP32Ty);
- vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
+ return VSHUFFLE(x, UndefValue::get(x->getType()), {0, 1, 2, 3, 4, 5, 6, 7});
}
else
{
- Value* pStack = STACKSAVE();
-
- // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
- Value* vSrcPtr = ALLOCA(vSrc->getType());
- STORE(vSrc, vSrcPtr);
-
- vGather = VUNDEF_F();
- Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
- Value *vOffsets = MUL(vIndices,vScaleVec);
- Value *mask = MASK(vMask);
- for(uint32_t i = 0; i < mVWidth; ++i)
- {
- // single component byte index
- Value *offset = VEXTRACT(vOffsets,C(i));
- // byte pointer to component
- Value *loadAddress = GEP(pBase,offset);
- loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
- // pointer to the value to load if we're masking off a component
- Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
- Value *selMask = VEXTRACT(mask,C(i));
- // switch in a safe address to load if we're trying to access a vertex
- Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
- Value *val = LOAD(validAddress);
- vGather = VINSERT(vGather,val,C(i));
- }
- STACKRESTORE(pStack);
+ return VSHUFFLE(x, UndefValue::get(x->getType()), {8, 9, 10, 11, 12, 13, 14, 15});
}
-
- return vGather;
}
- //////////////////////////////////////////////////////////////////////////
- /// @brief Generate a masked gather operation in LLVM IR. If not
- /// supported on the underlying platform, emulate it with loads
- /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
- /// @param pBase - Int8* base VB address pointer value
- /// @param vIndices - SIMD wide value of VB byte offsets
- /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
- /// @param scale - value to scale indices by
- Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
+ Value* Builder::JOIN_16(Value* a, Value* b)
{
- Value* vGather;
-
- // use avx2 gather instruction if available
- if(JM()->mArch.AVX2())
- {
- vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
- }
- else
- {
- Value* pStack = STACKSAVE();
-
- // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
- Value* vSrcPtr = ALLOCA(vSrc->getType());
- STORE(vSrc, vSrcPtr);
-
- vGather = VUNDEF_I();
- Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
- Value *vOffsets = MUL(vIndices, vScaleVec);
- Value *mask = MASK(vMask);
- for(uint32_t i = 0; i < mVWidth; ++i)
- {
- // single component byte index
- Value *offset = VEXTRACT(vOffsets, C(i));
- // byte pointer to component
- Value *loadAddress = GEP(pBase, offset);
- loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
- // pointer to the value to load if we're masking off a component
- Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
- Value *selMask = VEXTRACT(mask, C(i));
- // switch in a safe address to load if we're trying to access a vertex
- Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
- Value *val = LOAD(validAddress, C(0));
- vGather = VINSERT(vGather, val, C(i));
- }
-
- STACKRESTORE(pStack);
- }
- return vGather;
+ return VSHUFFLE(a, b, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
}
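
    // Illustrative round trip between simd8 and simd16 (v16 is a hypothetical
    // <16 x float> value): the shuffle masks above select the low and high
    // eight lanes, and JOIN_16 concatenates them back:
    //   Value* lo = EXTRACT_16(v16, 0); // lanes 0..7
    //   Value* hi = EXTRACT_16(v16, 1); // lanes 8..15
    //   Value* v  = JOIN_16(lo, hi);    // lane-for-lane equal to v16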
//////////////////////////////////////////////////////////////////////////
- /// @brief Generate a masked gather operation in LLVM IR. If not
- /// supported on the underlying platform, emulate it with loads
- /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
- /// @param pBase - Int8* base VB address pointer value
- /// @param vIndices - SIMD wide value of VB byte offsets
- /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
- /// @param scale - value to scale indices by
- Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
+ /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
+ Value* Builder::MASK(Value* vmask)
{
- Value* vGather;
-
- // use avx2 gather instruction if available
- if(JM()->mArch.AVX2())
- {
- vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, scale);
- }
- else
- {
- Value* pStack = STACKSAVE();
-
- // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
- Value* vSrcPtr = ALLOCA(vSrc->getType());
- STORE(vSrc, vSrcPtr);
-
- vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
- Value *vScaleVec = VECTOR_SPLAT(4, Z_EXT(scale,mInt32Ty));
- Value *vOffsets = MUL(vIndices,vScaleVec);
- Value *mask = MASK(vMask);
- for(uint32_t i = 0; i < mVWidth/2; ++i)
- {
- // single component byte index
- Value *offset = VEXTRACT(vOffsets,C(i));
- // byte pointer to component
- Value *loadAddress = GEP(pBase,offset);
- loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
- // pointer to the value to load if we're masking off a component
- Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
- Value *selMask = VEXTRACT(mask,C(i));
- // switch in a safe address to load if we're trying to access a vertex
- Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
- Value *val = LOAD(validAddress);
- vGather = VINSERT(vGather,val,C(i));
- }
- STACKRESTORE(pStack);
- }
- return vGather;
+ Value* src = BITCAST(vmask, mSimdInt32Ty);
+ return ICMP_SLT(src, VIMMED1(0));
}
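
    // Sketch: x86-style vector masks hold all-ones or all-zeros lanes, so
    // testing the sign bit via ICMP_SLT against zero recovers an <N x i1>
    // mask; e.g. a lane of 0xFFFFFFFF maps to true, 0x00000000 to false.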
-#if USE_SIMD16_BUILDER
- //////////////////////////////////////////////////////////////////////////
- /// @brief
- Value *Builder::EXTRACT(Value *a2, uint32_t imm)
+ Value* Builder::MASK_16(Value* vmask)
{
- const uint32_t i0 = (imm > 0) ? mVWidth : 0;
-
- Value *result = VUNDEF_F();
-
- for (uint32_t i = 0; i < mVWidth; i += 1)
- {
- Value *temp = VEXTRACT(a2, C(i0 + i));
-
- result = VINSERT(result, temp, C(i));
- }
-
- return result;
+ Value* src = BITCAST(vmask, mSimd16Int32Ty);
+ return ICMP_SLT(src, VIMMED1_16(0));
}
//////////////////////////////////////////////////////////////////////////
- /// @brief
- Value *Builder::INSERT(Value *a2, Value * b, uint32_t imm)
- {
- const uint32_t i0 = (imm > 0) ? mVWidth : 0;
-
- Value *result = BITCAST(a2, mSimd2FP32Ty);
+ /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
+ Value* Builder::VMASK(Value* mask) { return S_EXT(mask, mSimdInt32Ty); }
- for (uint32_t i = 0; i < mVWidth; i += 1)
- {
-#if 1
- if (!b->getType()->getScalarType()->isFloatTy())
- {
- b = BITCAST(b, mSimdFP32Ty);
- }
+ Value* Builder::VMASK_16(Value* mask) { return S_EXT(mask, mSimd16Int32Ty); }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Convert <Nxi1> llvm mask to integer
+ Value* Builder::VMOVMSK(Value* mask)
+ {
+#if LLVM_VERSION_MAJOR >= 11
+ VectorType* pVectorType = cast<VectorType>(mask->getType());
+ SWR_ASSERT(pVectorType->getElementType() == mInt1Ty);
+ uint32_t numLanes = pVectorType->getNumElements();
+#else
+ SWR_ASSERT(mask->getType()->getVectorElementType() == mInt1Ty);
+ uint32_t numLanes = mask->getType()->getVectorNumElements();
#endif
- Value *temp = VEXTRACT(b, C(i));
-
- result = VINSERT(result, temp, C(i0 + i));
+ Value* i32Result;
+ if (numLanes == 8)
+ {
+ i32Result = BITCAST(mask, mInt8Ty);
}
-
- return result;
- }
-
-#endif
- //////////////////////////////////////////////////////////////////////////
- /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
- Value* Builder::MASK(Value* vmask)
- {
- Value* src = BITCAST(vmask, mSimdInt32Ty);
- return ICMP_SLT(src, VIMMED1(0));
- }
-
- //////////////////////////////////////////////////////////////////////////
- /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
- Value* Builder::VMASK(Value* mask)
- {
- return S_EXT(mask, mSimdInt32Ty);
+ else if (numLanes == 16)
+ {
+ i32Result = BITCAST(mask, mInt16Ty);
+ }
+ else
+ {
+ SWR_ASSERT("Unsupported vector width");
+ i32Result = BITCAST(mask, mInt8Ty);
+ }
+ return Z_EXT(i32Result, mInt32Ty);
}
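
    // Example (illustrative): for an <8 x i1> mask with lanes 0 and 2 set,
    // the bitcast packs one bit per lane into an i8, so VMOVMSK returns
    // 0b00000101 = 5; VPOPCNT below counts active lanes directly from this.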
//////////////////////////////////////////////////////////////////////////
- /// @brief Generate a VPSHUFB operation in LLVM IR. If not
+ /// @brief Generate a VPSHUFB operation in LLVM IR. If not
/// supported on the underlying platform, emulate it
/// @param a - 256bit SIMD(32x8bit) of 8bit integer values
/// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
- /// Byte masks in lower 128 lane of b selects 8 bit values from lower
- /// 128bits of a, and vice versa for the upper lanes. If the mask
+ /// Byte masks in lower 128 lane of b selects 8 bit values from lower
+ /// 128bits of a, and vice versa for the upper lanes. If the mask
/// value is negative, '0' is inserted.
- Value *Builder::PSHUFB(Value* a, Value* b)
+ Value* Builder::PSHUFB(Value* a, Value* b)
{
Value* res;
// use avx2 pshufb instruction if available
- if(JM()->mArch.AVX2())
+ if (JM()->mArch.AVX2())
{
res = VPSHUFB(a, b);
}
else
{
Constant* cB = dyn_cast<Constant>(b);
+ assert(cB != nullptr);
// number of 8 bit elements in b
uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
// output vector
            Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));

// insert an 8 bit value from the high and low lanes of a per loop iteration
numElms /= 2;
- for(uint32_t i = 0; i < numElms; i++)
+ for (uint32_t i = 0; i < numElms; i++)
{
- ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
+ ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
// extract values from constant mask
- char valLow128bLane = (char)(cLow128b->getSExtValue());
+ char valLow128bLane = (char)(cLow128b->getSExtValue());
char valHigh128bLane = (char)(cHigh128b->getSExtValue());
Value* insertValLow128b;
Value* insertValHigh128b;
// if the mask value is negative, insert a '0' in the respective output position
- // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
- insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
- insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
+ // otherwise, lookup the value at mask position (bits 3..0 of the respective mask
+ // byte) in a and insert in output vector
+ insertValLow128b =
+ (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
+ insertValHigh128b = (valHigh128bLane < 0)
+ ? C((char)0)
+ : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
vShuf = VINSERT(vShuf, insertValLow128b, i);
vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
}
//////////////////////////////////////////////////////////////////////////
- /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32
+ /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32
/// bits)in LLVM IR. If not supported on the underlying platform, emulate it
- /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
+ /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
/// lower 8 values are used.
- Value *Builder::PMOVSXBD(Value* a)
+ Value* Builder::PMOVSXBD(Value* a)
{
// VPMOVSXBD output type
Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
}
//////////////////////////////////////////////////////////////////////////
- /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32
+ /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32
/// bits)in LLVM IR. If not supported on the underlying platform, emulate it
/// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
- Value *Builder::PMOVSXWD(Value* a)
+ Value* Builder::PMOVSXWD(Value* a)
{
// VPMOVSXWD output type
Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
}
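
    // Sketch (illustrative): both emulations shuffle the low eight source
    // lanes into place and sign-extend each to 32 bits, e.g. an i16 lane
    // holding 0xFFFE (-2) widens to the i32 lane value 0xFFFFFFFE:
    //   Value* vWide = PMOVSXWD(vPacked16); // hypothetical <8 x i16> input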
- //////////////////////////////////////////////////////////////////////////
- /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
- /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
- /// platform, emulate it
- /// @param a - 256bit SIMD lane(8x32bit) of integer values.
- /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
- Value *Builder::PERMD(Value* a, Value* idx)
- {
- Value* res;
- // use avx2 permute instruction if available
- if(JM()->mArch.AVX2())
- {
- res = VPERMD(a, idx);
- }
- else
- {
- if (isa<Constant>(idx))
- {
- res = VSHUFFLE(a, a, idx);
- }
- else
- {
- res = VUNDEF_I();
- for (uint32_t l = 0; l < JM()->mVWidth; ++l)
- {
- Value* pIndex = VEXTRACT(idx, C(l));
- Value* pVal = VEXTRACT(a, pIndex);
- res = VINSERT(res, pVal, C(l));
- }
- }
- }
- return res;
- }
-
- //////////////////////////////////////////////////////////////////////////
- /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
- /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
- /// platform, emulate it
- /// @param a - 256bit SIMD lane(8x32bit) of float values.
- /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
- Value *Builder::PERMPS(Value* a, Value* idx)
- {
- Value* res;
- // use avx2 permute instruction if available
- if (JM()->mArch.AVX2())
- {
- // llvm 3.6.0 swapped the order of the args to vpermd
- res = VPERMPS(idx, a);
- }
- else
- {
- if (isa<Constant>(idx))
- {
- res = VSHUFFLE(a, a, idx);
- }
- else
- {
- res = VUNDEF_F();
- for (uint32_t l = 0; l < JM()->mVWidth; ++l)
- {
- Value* pIndex = VEXTRACT(idx, C(l));
- Value* pVal = VEXTRACT(a, pIndex);
- res = VINSERT(res, pVal, C(l));
- }
- }
- }
-
- return res;
- }
-
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
/// in LLVM IR. If not supported on the underlying platform, emulate it
/// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
- Value *Builder::CVTPH2PS(Value* a)
+ Value* Builder::CVTPH2PS(Value* a, const llvm::Twine& name)
{
- if (JM()->mArch.F16C())
- {
- return VCVTPH2PS(a);
- }
- else
- {
- FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
- Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy));
-
- if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == nullptr)
- {
- sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", (void *)&ConvertFloat16ToFloat32);
- }
-
- Value* pResult = UndefValue::get(mSimdFP32Ty);
- for (uint32_t i = 0; i < mVWidth; ++i)
- {
- Value* pSrc = VEXTRACT(a, C(i));
- Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
- pResult = VINSERT(pResult, pConv, C(i));
- }
+ // Bitcast Nxint16 to Nxhalf
+#if LLVM_VERSION_MAJOR >= 11
+ uint32_t numElems = cast<VectorType>(a->getType())->getNumElements();
+#else
+ uint32_t numElems = a->getType()->getVectorNumElements();
+#endif
+ Value* input = BITCAST(a, VectorType::get(mFP16Ty, numElems));
- return pResult;
- }
+ return FP_EXT(input, VectorType::get(mFP32Ty, numElems), name);
}
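
    // A minimal sketch (pHalfData is a hypothetical pointer to <8 x i16> half
    // bits): the bitcast + fpext formulation lets the backend pick a native
    // vcvtph2ps on F16C hardware and scalarize elsewhere:
    //   Value* vBits = LOAD(pHalfData);
    //   Value* vF32  = CVTPH2PS(vBits, "unpackedHalf"); // <8 x float>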
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
/// in LLVM IR. If not supported on the underlying platform, emulate it
/// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
- Value *Builder::CVTPS2PH(Value* a, Value* rounding)
+ Value* Builder::CVTPS2PH(Value* a, Value* rounding)
{
if (JM()->mArch.F16C())
{
else
{
// call scalar C function for now
- FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
- Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy));
+ FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
+ Function* pCvtPs2Ph = cast<Function>(
+#if LLVM_VERSION_MAJOR >= 9
+ JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy).getCallee());
+#else
+ JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy));
+#endif
if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr)
{
- sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", (void *)&ConvertFloat32ToFloat16);
+ sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16",
+ (void*)&ConvertFloat32ToFloat16);
}
Value* pResult = UndefValue::get(mSimdInt16Ty);
for (uint32_t i = 0; i < mVWidth; ++i)
{
- Value* pSrc = VEXTRACT(a, C(i));
+ Value* pSrc = VEXTRACT(a, C(i));
Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
- pResult = VINSERT(pResult, pConv, C(i));
+ pResult = VINSERT(pResult, pConv, C(i));
}
return pResult;
}
}
- Value *Builder::PMAXSD(Value* a, Value* b)
+ Value* Builder::PMAXSD(Value* a, Value* b)
{
Value* cmp = ICMP_SGT(a, b);
return SELECT(cmp, a, b);
}
- Value *Builder::PMINSD(Value* a, Value* b)
+ Value* Builder::PMINSD(Value* a, Value* b)
{
Value* cmp = ICMP_SLT(a, b);
return SELECT(cmp, a, b);
}
- void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
- Value* mask, Value* vGatherComponents[], bool bPackedOutput)
- {
- const SWR_FORMAT_INFO &info = GetFormatInfo(format);
- if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
- {
- // ensure our mask is the correct type
- mask = BITCAST(mask, mSimdFP32Ty);
- GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
- }
- else
- {
- // ensure our mask is the correct type
- mask = BITCAST(mask, mSimdInt32Ty);
- GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
- }
- }
-
- void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
- Value* mask, Value* vGatherComponents[], bool bPackedOutput)
- {
- switch(info.bpp / info.numComps)
- {
- case 16:
- {
- Value* vGatherResult[2];
- Value *vMask;
-
- // TODO: vGatherMaskedVal
- Value* vGatherMaskedVal = VIMMED1((float)0);
-
- // always have at least one component out of x or y to fetch
-
- // save mask as it is zero'd out after each gather
- vMask = mask;
-
- vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
- // e.g. result of first 8x32bit integer gather for 16bit components
- // 256i - 0 1 2 3 4 5 6 7
- // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
- //
-
- // if we have at least one component out of x or y to fetch
- if(info.numComps > 2)
- {
- // offset base to the next components(zw) in the vertex to gather
- pSrcBase = GEP(pSrcBase, C((char)4));
- vMask = mask;
-
- vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
- // e.g. result of second 8x32bit integer gather for 16bit components
- // 256i - 0 1 2 3 4 5 6 7
- // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
- //
- }
- else
- {
- vGatherResult[1] = vGatherMaskedVal;
- }
-
- // Shuffle gathered components into place, each row is a component
- Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
- }
- break;
- case 32:
- {
- // apply defaults
- for (uint32_t i = 0; i < 4; ++i)
- {
- vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
- }
-
- for(uint32_t i = 0; i < info.numComps; i++)
- {
- uint32_t swizzleIndex = info.swizzle[i];
-
- // save mask as it is zero'd out after each gather
- Value *vMask = mask;
-
- // Gather a SIMD of components
- vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
-
- // offset base to the next component to gather
- pSrcBase = GEP(pSrcBase, C((char)4));
- }
- }
- break;
- default:
- SWR_INVALID("Invalid float format");
- break;
- }
- }
-
- void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
- Value* mask, Value* vGatherComponents[], bool bPackedOutput)
+ Value* Builder::PMAXUD(Value* a, Value* b)
{
- switch (info.bpp / info.numComps)
- {
- case 8:
- {
- Value* vGatherMaskedVal = VIMMED1((int32_t)0);
- Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
- // e.g. result of an 8x32bit integer gather for 8bit components
- // 256i - 0 1 2 3 4 5 6 7
- // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
-
- Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
- }
- break;
- case 16:
- {
- Value* vGatherResult[2];
- Value *vMask;
-
- // TODO: vGatherMaskedVal
- Value* vGatherMaskedVal = VIMMED1((int32_t)0);
-
- // always have at least one component out of x or y to fetch
-
- // save mask as it is zero'd out after each gather
- vMask = mask;
-
- vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
- // e.g. result of first 8x32bit integer gather for 16bit components
- // 256i - 0 1 2 3 4 5 6 7
- // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
- //
-
- // if we have at least one component out of x or y to fetch
- if(info.numComps > 2)
- {
- // offset base to the next components(zw) in the vertex to gather
- pSrcBase = GEP(pSrcBase, C((char)4));
- vMask = mask;
-
- vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
- // e.g. result of second 8x32bit integer gather for 16bit components
- // 256i - 0 1 2 3 4 5 6 7
- // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
- //
- }
- else
- {
- vGatherResult[1] = vGatherMaskedVal;
- }
-
- // Shuffle gathered components into place, each row is a component
- Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
-
- }
- break;
- case 32:
- {
- // apply defaults
- for (uint32_t i = 0; i < 4; ++i)
- {
- vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
- }
-
- for(uint32_t i = 0; i < info.numComps; i++)
- {
- uint32_t swizzleIndex = info.swizzle[i];
-
- // save mask as it is zero'd out after each gather
- Value *vMask = mask;
-
- // Gather a SIMD of components
- vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
-
- // offset base to the next component to gather
- pSrcBase = GEP(pSrcBase, C((char)4));
- }
- }
- break;
- default:
- SWR_INVALID("unsupported format");
- break;
- }
- }
-
- void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
- {
- // cast types
- Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
- Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
-
- // input could either be float or int vector; do shuffle work in int
- vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
- vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
-
- if(bPackedOutput)
- {
- Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
-
- // shuffle mask
- Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
- 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
- Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
- // after pshufb: group components together in each 128bit lane
- // 256i - 0 1 2 3 4 5 6 7
- // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
-
- Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
- // after PERMD: move and pack xy components into each 128bit lane
- // 256i - 0 1 2 3 4 5 6 7
- // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
-
- // do the same for zw components
- Value* vi128ZW = nullptr;
- if(info.numComps > 2)
- {
- Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
- vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
- }
-
- for(uint32_t i = 0; i < 4; i++)
- {
- uint32_t swizzleIndex = info.swizzle[i];
- // todo: fixed for packed
- Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
- if(i >= info.numComps)
- {
- // set the default component val
- vGatherOutput[swizzleIndex] = vGatherMaskedVal;
- continue;
- }
-
- // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
- uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
- // if x or y, use vi128XY permute result, else use vi128ZW
- Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
-
- // extract packed component 128 bit lanes
- vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
- }
-
- }
- else
- {
- // pshufb masks for each component
- Value* vConstMask[2];
- // x/z shuffle mask
- vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
- 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
-
- // y/w shuffle mask
- vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
- 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
-
-
- // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
- // apply defaults
- for (uint32_t i = 0; i < 4; ++i)
- {
- vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
- }
-
- for(uint32_t i = 0; i < info.numComps; i++)
- {
- uint32_t swizzleIndex = info.swizzle[i];
-
- // select correct constMask for x/z or y/w pshufb
- uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
- // if x or y, use vi128XY permute result, else use vi128ZW
- uint32_t selectedGather = (i < 2) ? 0 : 1;
-
- vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
- // after pshufb mask for x channel; z uses the same shuffle from the second gather
- // 256i - 0 1 2 3 4 5 6 7
- // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
- }
- }
+ Value* cmp = ICMP_UGT(a, b);
+ return SELECT(cmp, a, b);
}
- void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
+ Value* Builder::PMINUD(Value* a, Value* b)
{
- // cast types
- Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
- Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
-
- if(bPackedOutput)
- {
- Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
- // shuffle mask
- Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
- 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
- Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
- // after pshufb: group components together in each 128bit lane
- // 256i - 0 1 2 3 4 5 6 7
- // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
-
- Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
- // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
- // 256i - 0 1 2 3 4 5 6 7
- // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
-
- // do the same for zw components
- Value* vi128ZW = nullptr;
- if(info.numComps > 2)
- {
- vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
- }
-
- // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
- for(uint32_t i = 0; i < 4; i++)
- {
- uint32_t swizzleIndex = info.swizzle[i];
- // todo: fix for packed
- Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
- if(i >= info.numComps)
- {
- // set the default component val
- vGatherOutput[swizzleIndex] = vGatherMaskedVal;
- continue;
- }
-
- // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
- uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
- // if x or y, use vi128XY permute result, else use vi128ZW
- Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
-
- // sign extend
- vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
- }
- }
- // else zero extend
- else{
- // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
- // apply defaults
- for (uint32_t i = 0; i < 4; ++i)
- {
- vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
- }
-
- for(uint32_t i = 0; i < info.numComps; i++){
- uint32_t swizzleIndex = info.swizzle[i];
-
- // pshufb masks for each component
- Value* vConstMask;
- switch(i)
- {
- case 0:
- // x shuffle mask
- vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
- 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
- break;
- case 1:
- // y shuffle mask
- vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
- 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
- break;
- case 2:
- // z shuffle mask
- vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
- 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
- break;
- case 3:
- // w shuffle mask
- vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
- 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
- break;
- default:
- vConstMask = nullptr;
- break;
- }
-
- vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
- // after pshufb for x channel
- // 256i - 0 1 2 3 4 5 6 7
- // x000 x000 x000 x000 x000 x000 x000 x000
- }
- }
+ Value* cmp = ICMP_ULT(a, b);
+ return SELECT(cmp, a, b);
}
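
    // Sketch: the compare + select pair is LLVM's canonical integer min/max
    // idiom, which backends match to pminsd/pmaxsd/pminud/pmaxud where
    // available; e.g. PMINUD(VIMMED1(3u), VIMMED1(7u)) evaluates to an
    // all-lanes splat of 3.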
// Helper function to create alloca in entry block of function
Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
{
auto saveIP = IRB()->saveIP();
- IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
- pFunc->getEntryBlock().begin());
+ IRB()->SetInsertPoint(&pFunc->getEntryBlock(), pFunc->getEntryBlock().begin());
Value* pAlloca = ALLOCA(pType);
- if (saveIP.isSet()) IRB()->restoreIP(saveIP);
+ if (saveIP.isSet())
+ IRB()->restoreIP(saveIP);
return pAlloca;
}
Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
{
auto saveIP = IRB()->saveIP();
- IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
- pFunc->getEntryBlock().begin());
+ IRB()->SetInsertPoint(&pFunc->getEntryBlock(), pFunc->getEntryBlock().begin());
Value* pAlloca = ALLOCA(pType, pArraySize);
- if (saveIP.isSet()) IRB()->restoreIP(saveIP);
+ if (saveIP.isSet())
+ IRB()->restoreIP(saveIP);
return pAlloca;
}
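
    // Allocas are deliberately created in the function's entry block: LLVM's
    // mem2reg only considers entry-block allocas for promotion to SSA
    // registers, and an alloca emitted inside a loop would otherwise grow the
    // stack on every iteration. Hypothetical usage from JIT codegen:
    //   Value* pTmp = CreateEntryAlloca(pFunc, mSimdInt32Ty);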
- //////////////////////////////////////////////////////////////////////////
- /// @brief emulates a scatter operation.
- /// @param pDst - pointer to destination
- /// @param vSrc - vector of src data to scatter
- /// @param vOffsets - vector of byte offsets from pDst
- /// @param vMask - mask of valid lanes
- void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
+ Value* Builder::VABSPS(Value* a)
{
- /* Scatter algorithm
-
- while(Index = BitScanForward(mask))
- srcElem = srcVector[Index]
- offsetElem = offsetVector[Index]
- *(pDst + offsetElem) = srcElem
- Update mask (&= ~(1<<Index)
-
- */
-
- BasicBlock* pCurBB = IRB()->GetInsertBlock();
- Function* pFunc = pCurBB->getParent();
- Type* pSrcTy = vSrc->getType()->getVectorElementType();
-
- // Store vectors on stack
- if (pScatterStackSrc == nullptr)
- {
- // Save off stack allocations and reuse per scatter. Significantly reduces stack
- // requirements for shaders with a lot of scatters.
- pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
- pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
- }
-
- Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
- Value* pOffsetsArrayPtr = pScatterStackOffsets;
- STORE(vSrc, pSrcArrayPtr);
- STORE(vOffsets, pOffsetsArrayPtr);
+ Value* asInt = BITCAST(a, mSimdInt32Ty);
+ Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
+ return result;
+ }
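
    // Sketch: clearing bit 31 of each lane yields |x| without touching
    // exponent or mantissa, e.g. 0xBF800000 (-1.0f) & 0x7fffffff ->
    // 0x3F800000 (1.0f).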
- // Cast to pointers for random access
- pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
- pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
+ Value* Builder::ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name)
+ {
+ Value* lowCmp = ICMP_SLT(src, low);
+ Value* ret = SELECT(lowCmp, low, src);
- Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
+ Value* highCmp = ICMP_SGT(ret, high);
+ ret = SELECT(highCmp, high, ret, name);
- // Get cttz function
- Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
-
- // Setup loop basic block
- BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc);
+ return ret;
+ }
- // compute first set bit
- Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
+ Value* Builder::FCLAMP(Value* src, Value* low, Value* high)
+ {
+ Value* lowCmp = FCMP_OLT(src, low);
+ Value* ret = SELECT(lowCmp, low, src);
- Value* pIsUndef = ICMP_EQ(pIndex, C(32));
+ Value* highCmp = FCMP_OGT(ret, high);
+ ret = SELECT(highCmp, high, ret);
- // Split current block
- BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
+ return ret;
+ }
- // Remove unconditional jump created by splitBasicBlock
- pCurBB->getTerminator()->eraseFromParent();
+ Value* Builder::FCLAMP(Value* src, float low, float high)
+ {
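+        // Clamp by taking max against low, then min against high (splatted)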
+ Value* result = VMAXPS(src, VIMMED1(low));
+ result = VMINPS(result, VIMMED1(high));
- // Add terminator to end of original block
- IRB()->SetInsertPoint(pCurBB);
+ return result;
+ }
- // Add conditional branch
- COND_BR(pIsUndef, pPostLoop, pLoop);
+ Value* Builder::FMADDPS(Value* a, Value* b, Value* c)
+ {
+ Value* vOut;
+        // This maps to the LLVM fmuladd intrinsic
+ vOut = VFMADDPS(a, b, c);
+ return vOut;
+ }
- // Add loop basic block contents
- IRB()->SetInsertPoint(pLoop);
- PHINode* pIndexPhi = PHI(mInt32Ty, 2);
- PHINode* pMaskPhi = PHI(mInt32Ty, 2);
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief pop count on vector mask (e.g. <8 x i1>)
+ Value* Builder::VPOPCNT(Value* a) { return POPCNT(VMOVMSK(a)); }
- pIndexPhi->addIncoming(pIndex, pCurBB);
- pMaskPhi->addIncoming(pMask, pCurBB);
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Float / Fixed-point conversions
+ //////////////////////////////////////////////////////////////////////////
+ Value* Builder::VCVT_F32_FIXED_SI(Value* vFloat,
+ uint32_t numIntBits,
+ uint32_t numFracBits,
+ const llvm::Twine& name)
+ {
+ SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
+ Value* fixed = nullptr;
- // Extract elements for this index
- Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
- Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
+#if 0 // This doesn't work for negative numbers!!
+ {
+ fixed = FP_TO_SI(VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
+ C(_MM_FROUND_TO_NEAREST_INT)),
+ mSimdInt32Ty);
+ }
+#else
+ {
+            // Round to nearest int on the fractional bits first.
+            // Not entirely exact for negative numbers, but close enough.
+ vFloat = VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
+ C(_MM_FROUND_TO_NEAREST_INT));
+ vFloat = FMUL(vFloat, VIMMED1(1.0f / float(1 << numFracBits)));
- // GEP to this offset in dst
- Value* pCurDst = GEP(pDst, pOffsetElem);
- pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
- STORE(pSrcElem, pCurDst);
+ // TODO: Handle INF, NAN, overflow / underflow, etc.
- // Update the mask
- Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
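+            // Decompose the rounded float manually: restore the implicit
+            // leading 1 on the 23-bit mantissa, negate for negative inputs,
+            // then shift right by ((23 - numFracBits) - exponent) to align
+            // the binary point.
+            // Example: 3.5 with numFracBits == 8 becomes 3.5 * 256 = 896.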
+ Value* vSgn = FCMP_OLT(vFloat, VIMMED1(0.0f));
+ Value* vFloatInt = BITCAST(vFloat, mSimdInt32Ty);
+ Value* vFixed = AND(vFloatInt, VIMMED1((1 << 23) - 1));
+ vFixed = OR(vFixed, VIMMED1(1 << 23));
+ vFixed = SELECT(vSgn, NEG(vFixed), vFixed);
- // Terminator
- Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
+ Value* vExp = LSHR(SHL(vFloatInt, VIMMED1(1)), VIMMED1(24));
+ vExp = SUB(vExp, VIMMED1(127));
- pIsUndef = ICMP_EQ(pNewIndex, C(32));
- COND_BR(pIsUndef, pPostLoop, pLoop);
+ Value* vExtraBits = SUB(VIMMED1(23 - numFracBits), vExp);
- // Update phi edges
- pIndexPhi->addIncoming(pNewIndex, pLoop);
- pMaskPhi->addIncoming(pNewMask, pLoop);
+ fixed = ASHR(vFixed, vExtraBits, name);
+        }
+#endif
- // Move builder to beginning of post loop
- IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
+ return fixed;
}
- Value* Builder::VABSPS(Value* a)
+ Value* Builder::VCVT_FIXED_SI_F32(Value* vFixed,
+ uint32_t numIntBits,
+ uint32_t numFracBits,
+ const llvm::Twine& name)
{
- Value* asInt = BITCAST(a, mSimdInt32Ty);
- Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
- return result;
- }
+ SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
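+        // Convert by summing the signed integer part and the fractional part,
+        // e.g. for 16.8 fixed-point, 0x380 -> 3 + 128/256 = 3.5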
+ uint32_t extraBits = 32 - numIntBits - numFracBits;
+ if (numIntBits && extraBits)
+ {
+ // Sign extend
+ Value* shftAmt = VIMMED1(extraBits);
+ vFixed = ASHR(SHL(vFixed, shftAmt), shftAmt);
+ }
- Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
- {
- Value *lowCmp = ICMP_SLT(src, low);
- Value *ret = SELECT(lowCmp, low, src);
+ Value* fVal = VIMMED1(0.0f);
+ Value* fFrac = VIMMED1(0.0f);
+ if (numIntBits)
+ {
+ fVal = SI_TO_FP(ASHR(vFixed, VIMMED1(numFracBits)), mSimdFP32Ty, name);
+ }
- Value *highCmp = ICMP_SGT(ret, high);
- ret = SELECT(highCmp, high, ret);
+ if (numFracBits)
+ {
+ fFrac = UI_TO_FP(AND(vFixed, VIMMED1((1 << numFracBits) - 1)), mSimdFP32Ty);
+ fFrac = FDIV(fFrac, VIMMED1(float(1 << numFracBits)), name);
+ }
- return ret;
+ return FADD(fVal, fFrac, name);
}
- Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
+ Value* Builder::VCVT_F32_FIXED_UI(Value* vFloat,
+ uint32_t numIntBits,
+ uint32_t numFracBits,
+ const llvm::Twine& name)
{
- Value *lowCmp = FCMP_OLT(src, low);
- Value *ret = SELECT(lowCmp, low, src);
+ SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
+ Value* fixed = nullptr;
+#if 1 // KNOB_SIM_FAST_MATH? The enabled path below is correct from a
+      // precision standpoint...
+ {
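+            // Scale by 2^numFracBits, round to nearest, then convert to
+            // unsigned int, e.g. 1.25 with numFracBits == 4 -> 20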
+ fixed = FP_TO_UI(VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
+ C(_MM_FROUND_TO_NEAREST_INT)),
+ mSimdInt32Ty);
+ }
+#else
+ {
+            // Round to nearest int on the fractional bits first
+ vFloat = VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
+ C(_MM_FROUND_TO_NEAREST_INT));
+ vFloat = FMUL(vFloat, VIMMED1(1.0f / float(1 << numFracBits)));
- Value *highCmp = FCMP_OGT(ret, high);
- ret = SELECT(highCmp, high, ret);
+ // TODO: Handle INF, NAN, overflow / underflow, etc.
- return ret;
- }
+ Value* vSgn = FCMP_OLT(vFloat, VIMMED1(0.0f));
+ Value* vFloatInt = BITCAST(vFloat, mSimdInt32Ty);
+ Value* vFixed = AND(vFloatInt, VIMMED1((1 << 23) - 1));
+ vFixed = OR(vFixed, VIMMED1(1 << 23));
- Value *Builder::FCLAMP(Value* src, float low, float high)
- {
- Value* result = VMAXPS(src, VIMMED1(low));
- result = VMINPS(result, VIMMED1(high));
+ Value* vExp = LSHR(SHL(vFloatInt, VIMMED1(1)), VIMMED1(24));
+ vExp = SUB(vExp, VIMMED1(127));
- return result;
- }
+ Value* vExtraBits = SUB(VIMMED1(23 - numFracBits), vExp);
- //////////////////////////////////////////////////////////////////////////
- /// @brief save/restore stack, providing ability to push/pop the stack and
- /// reduce overall stack requirements for temporary stack use
- Value* Builder::STACKSAVE()
- {
- Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
- return CALLA(pfnStackSave);
+ fixed = LSHR(vFixed, vExtraBits, name);
+ }
+#endif
+ return fixed;
}
- void Builder::STACKRESTORE(Value* pSaved)
+ Value* Builder::VCVT_FIXED_UI_F32(Value* vFixed,
+ uint32_t numIntBits,
+ uint32_t numFracBits,
+ const llvm::Twine& name)
{
- Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
- CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
- }
+ SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
+ uint32_t extraBits = 32 - numIntBits - numFracBits;
+ if (numIntBits && extraBits)
+ {
+ // Sign extend
+ Value* shftAmt = VIMMED1(extraBits);
+ vFixed = ASHR(SHL(vFixed, shftAmt), shftAmt);
+ }
- Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
- {
- Value* vOut;
- // use FMADs if available
- if(JM()->mArch.AVX2())
+ Value* fVal = VIMMED1(0.0f);
+ Value* fFrac = VIMMED1(0.0f);
+ if (numIntBits)
{
- vOut = VFMADDPS(a, b, c);
+ fVal = UI_TO_FP(LSHR(vFixed, VIMMED1(numFracBits)), mSimdFP32Ty, name);
}
- else
+
+ if (numFracBits)
{
- vOut = FADD(FMUL(a, b), c);
+ fFrac = UI_TO_FP(AND(vFixed, VIMMED1((1 << numFracBits) - 1)), mSimdFP32Ty);
+ fFrac = FDIV(fFrac, VIMMED1(float(1 << numFracBits)), name);
}
- return vOut;
- }
- Value* Builder::POPCNT(Value* a)
- {
- Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
- return CALL(pCtPop, std::initializer_list<Value*>{a});
+ return FADD(fVal, fFrac, name);
}
//////////////////////////////////////////////////////////////////////////
-    /// @brief C functions called by LLVM IR
+    /// @brief Emulation of 128-bit lane extract/insert via shuffles
//////////////////////////////////////////////////////////////////////////
- //////////////////////////////////////////////////////////////////////////
- /// @brief called in JIT code, inserted by PRINT
- /// output to both stdout and visual studio debug console
- void __cdecl CallPrint(const char* fmt, ...)
+ Value* Builder::VEXTRACTI128(Value* a, Constant* imm8)
{
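+        // Emulates AVX2 vextracti128: the shuffle selects the low half of
+        // the lanes when imm8 is zero, the high half otherwise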
- va_list args;
- va_start(args, fmt);
- vprintf(fmt, args);
-
- #if defined( _WIN32 )
- char strBuf[1024];
- vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
- OutputDebugStringA(strBuf);
- #endif
-
- va_end(args);
- }
-
- Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
- {
- bool flag = !imm8->isZeroValue();
- SmallVector<Constant*,8> idx;
- for (unsigned i = 0; i < mVWidth / 2; i++) {
+ bool flag = !imm8->isZeroValue();
+ SmallVector<Constant*, 8> idx;
+ for (unsigned i = 0; i < mVWidth / 2; i++)
+ {
idx.push_back(C(flag ? i + mVWidth / 2 : i));
}
return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
}
- Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
+ Value* Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
{
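+        // Emulates AVX2 vinserti128: widen b with undef lanes, then blend it
+        // into the low (imm8 == 0) or high (imm8 != 0) half of a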
- bool flag = !imm8->isZeroValue();
- SmallVector<Constant*,8> idx;
- for (unsigned i = 0; i < mVWidth; i++) {
+ bool flag = !imm8->isZeroValue();
+ SmallVector<Constant*, 8> idx;
+ for (unsigned i = 0; i < mVWidth; i++)
+ {
idx.push_back(C(i));
}
- Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
+ Value* inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
- SmallVector<Constant*,8> idx2;
- for (unsigned i = 0; i < mVWidth / 2; i++) {
+ SmallVector<Constant*, 8> idx2;
+ for (unsigned i = 0; i < mVWidth / 2; i++)
+ {
idx2.push_back(C(flag ? i : i + mVWidth));
}
- for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
+ for (unsigned i = mVWidth / 2; i < mVWidth; i++)
+ {
idx2.push_back(C(flag ? i + mVWidth / 2 : i));
}
        return VSHUFFLE(a, inter, ConstantVector::get(idx2));
    }
// rdtsc buckets macros
void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
{
- // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
- // buckets framework when single threaded
+        // @todo Due to an issue with thread-local storage propagation in LLVM,
+        // we can only safely call into the buckets framework when single-threaded.
if (KNOB_SINGLE_THREADED)
{
std::vector<Type*> args{
- PointerType::get(mInt32Ty, 0), // pBucketMgr
- mInt32Ty // id
+ PointerType::get(mInt32Ty, 0), // pBucketMgr
+ mInt32Ty // id
};
FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
- Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
- if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
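+            // LLVM 9 changed Module::getOrInsertFunction to return a
+            // FunctionCallee, hence the getCallee() before the cast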
+ Function* pFunc = cast<Function>(
+#if LLVM_VERSION_MAJOR >= 9
+ JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy).getCallee());
+#else
+ JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
+#endif
+ if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") ==
+ nullptr)
{
- sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
+ sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket",
+ (void*)&BucketManager_StartBucket);
}
- CALL(pFunc, { pBucketMgr, pId });
+ CALL(pFunc, {pBucketMgr, pId});
}
}
void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
{
- // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
- // buckets framework when single threaded
+        // @todo Due to an issue with thread-local storage propagation in LLVM,
+        // we can only safely call into the buckets framework when single-threaded.
if (KNOB_SINGLE_THREADED)
{
std::vector<Type*> args{
- PointerType::get(mInt32Ty, 0), // pBucketMgr
- mInt32Ty // id
+ PointerType::get(mInt32Ty, 0), // pBucketMgr
+ mInt32Ty // id
};
FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
- Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
- if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
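+            // LLVM 9 changed Module::getOrInsertFunction to return a
+            // FunctionCallee, hence the getCallee() before the cast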
+ Function* pFunc = cast<Function>(
+#if LLVM_VERSION_MAJOR >= 9
+ JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy).getCallee());
+#else
+ JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
+#endif
+ if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") ==
+ nullptr)
{
- sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
+ sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket",
+ (void*)&BucketManager_StopBucket);
}
- CALL(pFunc, { pBucketMgr, pId });
+ CALL(pFunc, {pBucketMgr, pId});
}
}
-
uint32_t Builder::GetTypeSize(Type* pType)
{
if (pType->isStructTy())
{
uint32_t numElems = pType->getStructNumElements();
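+            // Note: assumes a homogeneous struct; every member is sized like
+            // element 0 and padding is ignored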
- Type* pElemTy = pType->getStructElementType(0);
+ Type* pElemTy = pType->getStructElementType(0);
return numElems * GetTypeSize(pElemTy);
}
if (pType->isArrayTy())
{
uint32_t numElems = pType->getArrayNumElements();
- Type* pElemTy = pType->getArrayElementType();
+ Type* pElemTy = pType->getArrayElementType();
return numElems * GetTypeSize(pElemTy);
}
SWR_ASSERT(false, "Unimplemented type.");
return 0;
}
-}
+} // namespace SwrJit