Revert "swr/rast: Archrast codegen updates"

[mesa.git] / src / gallium / drivers / swr / rasterizer / common / simdlib_512_emu.inl
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl

index d6af7b1c64169c0eac8d12b18a34079eadf76964..f25d834725c05955088a238d82fdac02e3f6944b 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
@@ -1,25 +1,25 @@
  /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
  #if !defined(__SIMD_LIB_AVX_HPP__)
  #error Do not include this file directly, use "simdlib.hpp" instead.
  #endif
@@ -29,149 +29,143 @@
  //============================================================================
  
  static const int TARGET_SIMD_WIDTH = 8;
-using SIMD128T = SIMD128Impl::AVXImpl;
-
-#define SIMD_WRAPPER_1(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float const &a)   \
-    {\
-        return Float\
-        {\
-            SIMD256T::op(a.v8[0]),\
-            SIMD256T::op(a.v8[1]),\
-        };\
+using SIMD128T                     = SIMD128Impl::AVXImpl;
+
+#define SIMD_WRAPPER_1(op)                              \
+    static SIMDINLINE Float SIMDCALL op(Float const& a) \
+    {                                                   \
+        return Float{                                   \
+            SIMD256T::op(a.v8[0]),                      \
+            SIMD256T::op(a.v8[1]),                      \
+        };                                              \
      }
  
-#define SIMD_WRAPPER_2(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b)    \
-    {\
-        return Float\
-        {\
-            SIMD256T::op(a.v8[0], b.v8[0]),\
-            SIMD256T::op(a.v8[1], b.v8[1]),\
-        };\
+#define SIMD_WRAPPER_2(op)                                              \
+    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
+    {                                                                   \
+        return Float{                                                   \
+            SIMD256T::op(a.v8[0], b.v8[0]),                             \
+            SIMD256T::op(a.v8[1], b.v8[1]),                             \
+        };                                                              \
      }
  
-#define SIMD_WRAPPER_2I(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b)   \
-    {\
-        return Float\
-        {\
-            SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
-            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
-        };\
+#define SIMD_WRAPPER_2I(op)                                                              \
+    template <int ImmT>                                                                  \
+    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b)                  \
+    {                                                                                    \
+        return Float{                                                                    \
+            SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),                        \
+            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \
+        };                                                                               \
      }
  
-#define SIMD_WRAPPER_2I_1(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b)   \
-    {\
-        return Float\
-        {\
-            SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
-            SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
-        };\
+#define SIMD_WRAPPER_2I_1(op)                                           \
+    template <int ImmT>                                                 \
+    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
+    {                                                                   \
+        return Float{                                                   \
+            SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),              \
+            SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),              \
+        };                                                              \
      }
  
-#define SIMD_WRAPPER_3(op)  \
-        static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b, Float const &c)   \
-        {\
-            return Float\
-            {\
-                SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
-                SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
-            };\
-        }
-
-#define SIMD_IWRAPPER_1(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a)   \
-    {\
-        return Integer\
-        {\
-            SIMD256T::op(a.v8[0]),\
-            SIMD256T::op(a.v8[1]),\
-        };\
+#define SIMD_WRAPPER_3(op)                                                              \
+    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \
+    {                                                                                   \
+        return Float{                                                                   \
+            SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),                                    \
+            SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),                                    \
+        };                                                                              \
      }
  
-#define SIMD_IWRAPPER_2(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
-    {\
-        return Integer\
-        {\
-            SIMD256T::op(a.v8[0], b.v8[0]),\
-            SIMD256T::op(a.v8[1], b.v8[1]),\
-        };\
+#define SIMD_IWRAPPER_1(op)                                 \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
+    {                                                       \
+        return Integer{                                     \
+            SIMD256T::op(a.v8[0]),                          \
+            SIMD256T::op(a.v8[1]),                          \
+        };                                                  \
      }
  
-#define SIMD_IWRAPPER_2I(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
-    {\
-        return Integer\
-        {\
-            SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
-            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
-        };\
+#define SIMD_IWRAPPER_2(op)                                                   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    {                                                                         \
+        return Integer{                                                       \
+            SIMD256T::op(a.v8[0], b.v8[0]),                                   \
+            SIMD256T::op(a.v8[1], b.v8[1]),                                   \
+        };                                                                    \
      }
  
-#define SIMD_IWRAPPER_2I_1(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
-    {\
-        return Integer\
-        {\
-            SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
-            SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
-        };\
+#define SIMD_IWRAPPER_2I(op)                                                             \
+    template <int ImmT>                                                                  \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b)            \
+    {                                                                                    \
+        return Integer{                                                                  \
+            SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),                        \
+            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \
+        };                                                                               \
      }
  
-#define SIMD_IWRAPPER_2I_2(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
-    {\
-        return Integer\
-        {\
-            SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]),\
-            SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]),\
-        };\
+#define SIMD_IWRAPPER_2I_1(op)                                                \
+    template <int ImmT>                                                       \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    {                                                                         \
+        return Integer{                                                       \
+            SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),                    \
+            SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),                    \
+        };                                                                    \
      }
  
-#define SIMD_IWRAPPER_3(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b, Integer const &c)   \
-    {\
-        return Integer\
-        {\
-            SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
-            SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
-        };\
+#define SIMD_IWRAPPER_2I_2(op)                                                \
+    template <int ImmT>                                                       \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    {                                                                         \
+        return Integer{                                                       \
+            SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]),              \
+            SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]),       \
+        };                                                                    \
+    }
+
+#define SIMD_IWRAPPER_3(op)                                                                     \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \
+    {                                                                                           \
+        return Integer{                                                                         \
+            SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),                                            \
+            SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),                                            \
+        };                                                                                      \
      }
  
  //-----------------------------------------------------------------------
  // Single precision floating point arithmetic operations
  //-----------------------------------------------------------------------
-SIMD_WRAPPER_2(add_ps);     // return a + b
-SIMD_WRAPPER_2(div_ps);     // return a / b
-SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
-SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
-SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
-SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
-SIMD_WRAPPER_2(mul_ps);     // return a * b
-SIMD_WRAPPER_1(rcp_ps);     // return 1.0f / a
-SIMD_WRAPPER_1(rsqrt_ps);   // return 1.0f / sqrt(a)
-SIMD_WRAPPER_2(sub_ps);     // return a - b
+SIMD_WRAPPER_2(add_ps);   // return a + b
+SIMD_WRAPPER_2(div_ps);   // return a / b
+SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
+SIMD_WRAPPER_2(max_ps);   // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);   // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);   // return a * b
+SIMD_WRAPPER_1(rcp_ps);   // return 1.0f / a
+SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps);   // return a - b
  
  template <RoundMode RMT>
-static SIMDINLINE Float SIMDCALL round_ps(Float const &a)
+static SIMDINLINE Float SIMDCALL round_ps(Float const& a)
  {
-    return Float
-    {
+    return Float{
          SIMD256T::template round_ps<RMT>(a.v8[0]),
          SIMD256T::template round_ps<RMT>(a.v8[1]),
      };
  }
  
-static SIMDINLINE Float SIMDCALL ceil_ps(Float const &a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
-static SIMDINLINE Float SIMDCALL floor_ps(Float const &a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a)
+{
+    return round_ps<RoundMode::CEIL_NOEXC>(a);
+}
+static SIMDINLINE Float SIMDCALL floor_ps(Float const& a)
+{
+    return round_ps<RoundMode::FLOOR_NOEXC>(a);
+}
  
  //-----------------------------------------------------------------------
  // Integer (various width) arithmetic operations
@@ -179,7 +173,7 @@ static SIMDINLINE Float SIMDCALL floor_ps(Float const &a) { return round_ps<Roun
  SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
  SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
  SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
-SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
  SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
  SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
  SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
@@ -207,178 +201,168 @@ SIMD_IWRAPPER_2(or_si);     // return a | b       (int)
  SIMD_WRAPPER_2(xor_ps);     // return a ^ b       (float treated as int)
  SIMD_IWRAPPER_2(xor_si);    // return a ^ b       (int)
  
-
  //-----------------------------------------------------------------------
  // Shift operations
  //-----------------------------------------------------------------------
-template<int ImmT>
-static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const &a)      // return a << ImmT
+template <int ImmT>
+static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const& a) // return a << ImmT
  {
-    return Integer
-    {
+    return Integer{
          SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
          SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
      };
  }
  
-SIMD_IWRAPPER_2(sllv_epi32);                                // return a << b      (uint32)
+SIMD_IWRAPPER_2(sllv_epi32); // return a << b      (uint32)
  
-template<int ImmT>
-static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const &a)      // return a >> ImmT   (int32)
+template <int ImmT>
+static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const& a) // return a >> ImmT   (int32)
  {
-    return Integer
-    {
+    return Integer{
          SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
          SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
      };
  }
  
-template<int ImmT>
-static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const &a)      // return a >> ImmT   (uint32)
+template <int ImmT>
+static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const& a) // return a >> ImmT   (uint32)
  {
-    return Integer
-    {
+    return Integer{
          SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
          SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
      };
  }
  
-template<int ImmT>                                          // for each 128-bit lane:
-static SIMDINLINE Integer SIMDCALL srli_si(Integer const &a)         //  return a >> (ImmT*8) (uint)
+template <int ImmT>                                          // for each 128-bit lane:
+static SIMDINLINE Integer SIMDCALL srli_si(Integer const& a) //  return a >> (ImmT*8) (uint)
  {
-    return Integer
-    {
+    return Integer{
          SIMD256T::template srli_si<ImmT>(a.v8[0]),
          SIMD256T::template srli_si<ImmT>(a.v8[1]),
      };
  }
-template<int ImmT>
-static SIMDINLINE Float SIMDCALL srlisi_ps(Float const &a)       // same as srli_si, but with Float cast to int
+template <int ImmT>
+static SIMDINLINE Float SIMDCALL
+                        srlisi_ps(Float const& a) // same as srli_si, but with Float cast to int
  {
-    return Float
-    {
+    return Float{
          SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
          SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
      };
  }
  
-SIMD_IWRAPPER_2(srlv_epi32);                                // return a >> b      (uint32)
+SIMD_IWRAPPER_2(srlv_epi32); // return a >> b      (uint32)
  
  //-----------------------------------------------------------------------
  // Conversion operations
  //-----------------------------------------------------------------------
-static SIMDINLINE Float SIMDCALL castpd_ps(Double const &a)              // return *(Float*)(&a)
+static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a) // return *(Float*)(&a)
  {
-    return Float
-    {
+    return Float{
          SIMD256T::castpd_ps(a.v8[0]),
          SIMD256T::castpd_ps(a.v8[1]),
      };
  }
  
-static SIMDINLINE Integer SIMDCALL castps_si(Float const &a)              // return *(Integer*)(&a)
+static SIMDINLINE Integer SIMDCALL castps_si(Float const& a) // return *(Integer*)(&a)
  {
-    return Integer
-    {
+    return Integer{
          SIMD256T::castps_si(a.v8[0]),
          SIMD256T::castps_si(a.v8[1]),
      };
  }
  
-static SIMDINLINE Double SIMDCALL castsi_pd(Integer const &a)              // return *(Double*)(&a)
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a) // return *(Double*)(&a)
  {
-    return Double
-    {
+    return Double{
          SIMD256T::castsi_pd(a.v8[0]),
          SIMD256T::castsi_pd(a.v8[1]),
      };
  }
  
-static SIMDINLINE Double SIMDCALL castps_pd(Float const &a)   // return *(Double*)(&a)
+static SIMDINLINE Double SIMDCALL castps_pd(Float const& a) // return *(Double*)(&a)
  {
-    return Double
-    {
+    return Double{
          SIMD256T::castps_pd(a.v8[0]),
          SIMD256T::castps_pd(a.v8[1]),
      };
  }
  
-static SIMDINLINE Float SIMDCALL castsi_ps(Integer const &a)              // return *(Float*)(&a)
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a) // return *(Float*)(&a)
  {
-    return Float
-    {
+    return Float{
          SIMD256T::castsi_ps(a.v8[0]),
          SIMD256T::castsi_ps(a.v8[1]),
      };
  }
  
-static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer const &a)            // return (float)a    (int32 --> float)
+static SIMDINLINE Float SIMDCALL
+                        cvtepi32_ps(Integer const& a) // return (float)a    (int32 --> float)
  {
-    return Float
-    {
+    return Float{
          SIMD256T::cvtepi32_ps(a.v8[0]),
          SIMD256T::cvtepi32_ps(a.v8[1]),
      };
  }
  
-static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer const &a)          // return (int16)a    (uint8 --> int16)
+static SIMDINLINE Integer SIMDCALL
+                          cvtepu8_epi16(SIMD256Impl::Integer const& a) // return (int16)a    (uint8 --> int16)
  {
-    return Integer
-    {
+    return Integer{
          SIMD256T::cvtepu8_epi16(a.v4[0]),
          SIMD256T::cvtepu8_epi16(a.v4[1]),
      };
  }
  
-static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer const &a)          // return (int32)a    (uint8 --> int32)
+static SIMDINLINE Integer SIMDCALL
+                          cvtepu8_epi32(SIMD256Impl::Integer const& a) // return (int32)a    (uint8 --> int32)
  {
-    return Integer
-       {
+    return Integer{
          SIMD256T::cvtepu8_epi32(a.v4[0]),
          SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])),
-       };
+    };
  }
  
-static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer const &a)         // return (int32)a    (uint16 --> int32)
+static SIMDINLINE Integer SIMDCALL
+                          cvtepu16_epi32(SIMD256Impl::Integer const& a) // return (int32)a    (uint16 --> int32)
  {
-    return Integer
-    {
+    return Integer{
          SIMD256T::cvtepu16_epi32(a.v4[0]),
          SIMD256T::cvtepu16_epi32(a.v4[1]),
      };
  }
  
-static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer const &a)         // return (int64)a    (uint16 --> int64)
+static SIMDINLINE Integer SIMDCALL
+                          cvtepu16_epi64(SIMD256Impl::Integer const& a) // return (int64)a    (uint16 --> int64)
  {
-    return Integer
-    {
+    return Integer{
          SIMD256T::cvtepu16_epi64(a.v4[0]),
          SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])),
      };
  }
  
-static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer const &a)         // return (int64)a    (uint32 --> int64)
+static SIMDINLINE Integer SIMDCALL
+                          cvtepu32_epi64(SIMD256Impl::Integer const& a) // return (int64)a    (uint32 --> int64)
  {
-    return Integer
-    {
+    return Integer{
          SIMD256T::cvtepu32_epi64(a.v4[0]),
          SIMD256T::cvtepu32_epi64(a.v4[1]),
      };
  }
  
-static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float const &a)            // return (int32)a    (float --> int32)
+static SIMDINLINE Integer SIMDCALL
+                          cvtps_epi32(Float const& a) // return (int32)a    (float --> int32)
  {
-    return Integer
-    {
+    return Integer{
          SIMD256T::cvtps_epi32(a.v8[0]),
          SIMD256T::cvtps_epi32(a.v8[1]),
      };
  }
  
-static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float const &a)           // return (int32)a    (rnd_to_zero(float) --> int32)
+static SIMDINLINE Integer SIMDCALL
+                          cvttps_epi32(Float const& a) // return (int32)a    (rnd_to_zero(float) --> int32)
  {
-    return Integer
-    {
+    return Integer{
          SIMD256T::cvtps_epi32(a.v8[0]),
          SIMD256T::cvtps_epi32(a.v8[1]),
      };
@@ -387,126 +371,144 @@ static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float const &a)           // ret
  //-----------------------------------------------------------------------
  // Comparison operations
  //-----------------------------------------------------------------------
-template<CompareType CmpTypeT>
-static SIMDINLINE Float SIMDCALL cmp_ps(Float const &a, Float const &b) // return a (CmpTypeT) b
+template <CompareType CmpTypeT>
+static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b
  {
-    return Float
-    {
+    return Float{
          SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
          SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
      };
  }
-static SIMDINLINE Float SIMDCALL cmplt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpge_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmple_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b)
+{
+    return cmp_ps<CompareType::LT_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b)
+{
+    return cmp_ps<CompareType::GT_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b)
+{
+    return cmp_ps<CompareType::NEQ_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b)
+{
+    return cmp_ps<CompareType::EQ_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b)
+{
+    return cmp_ps<CompareType::GE_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b)
+{
+    return cmp_ps<CompareType::LE_OQ>(a, b);
+}
  
-template<CompareType CmpTypeT>
-static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const &a, Float const &b)
+template <CompareType CmpTypeT>
+static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const& a, Float const& b)
  {
      return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
  }
  
+SIMD_IWRAPPER_2(cmpeq_epi8);  // return a == b (int8)
+SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
+SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
+SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
+SIMD_IWRAPPER_2(cmpgt_epi8);  // return a > b (int8)
+SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
+SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
+SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
+SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
  
-SIMD_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
-SIMD_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
-SIMD_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
-SIMD_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
-SIMD_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
-SIMD_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
-SIMD_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
-SIMD_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
-SIMD_IWRAPPER_2(cmplt_epi32);   // return a < b (int32)
-
-static SIMDINLINE bool SIMDCALL testz_ps(Float const &a, Float const &b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
+static SIMDINLINE bool SIMDCALL
+                       testz_ps(Float const& a, Float const& b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
  {
-    return  0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) &
-                  SIMD256T::testz_ps(a.v8[1], b.v8[1]));
+    return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) & SIMD256T::testz_ps(a.v8[1], b.v8[1]));
  }
  
-static SIMDINLINE int SIMDCALL testz_si(Integer const &a, Integer const &b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
+static SIMDINLINE bool SIMDCALL
+                       testz_si(Integer const& a, Integer const& b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
  {
-    return  0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) &
-                  SIMD256T::testz_si(a.v8[1], b.v8[1]));
+    return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) & SIMD256T::testz_si(a.v8[1], b.v8[1]));
  }
  
  //-----------------------------------------------------------------------
  // Blend / shuffle / permute operations
  //-----------------------------------------------------------------------
-SIMD_WRAPPER_2I(blend_ps);  // return ImmT ? b : a  (float)
-SIMD_IWRAPPER_2I(blend_epi32);  // return ImmT ? b : a  (int32)
-SIMD_WRAPPER_3(blendv_ps);  // return mask ? b : a  (float)
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Float const &mask) // return mask ? b : a (int)
-{
-    return Integer
-    {
+SIMD_WRAPPER_2I(blend_ps);     // return ImmT ? b : a  (float)
+SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a  (int32)
+SIMD_WRAPPER_3(blendv_ps);     // return mask ? b : a  (float)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
+                                                Integer const& b,
+                                                Float const&   mask) // return mask ? b : a (int)
+{
+    return Integer{
          SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
          SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
      };
  }
  
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Integer const &mask) // return mask ? b : a (int)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
+                                                Integer const& b,
+                                                Integer const& mask) // return mask ? b : a (int)
  {
-    return Integer
-    {
+    return Integer{
          SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
          SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
      };
  }
  
-static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)         // return *p (all elements in vector get same value)
+static SIMDINLINE Float SIMDCALL
+                        broadcast_ss(float const* p) // return *p (all elements in vector get same value)
  {
      float f = *p;
-    return Float
-    {
+    return Float{
          SIMD256T::set1_ps(f),
          SIMD256T::set1_ps(f),
      };
  }
  
-template<int imm>
-static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const &a)
+template <int imm>
+static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const& a)
  {
      SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
      return a.v8[imm];
  }
  
-template<int imm>
-static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const &a)
+template <int imm>
+static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const& a)
  {
      SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
      return a.v8[imm];
  }
  
-template<int imm>
-static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const &a)
+template <int imm>
+static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const& a)
  {
      SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
      return a.v8[imm];
  }
  
-template<int imm>
-static SIMDINLINE Float SIMDCALL insert_ps(Float const &a, SIMD256Impl::Float const &b)
+template <int imm>
+static SIMDINLINE Float SIMDCALL insert_ps(Float const& a, SIMD256Impl::Float const& b)
  {
      SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
-    Float r = a;
+    Float r   = a;
      r.v8[imm] = b;
      return r;
  }
  
-template<int imm>
-static SIMDINLINE Double SIMDCALL insert_pd(Double const &a, SIMD256Impl::Double const &b)
+template <int imm>
+static SIMDINLINE Double SIMDCALL insert_pd(Double const& a, SIMD256Impl::Double const& b)
  {
      SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
-    Double r = a;
+    Double r  = a;
      r.v8[imm] = b;
      return r;
  }
  
-template<int imm>
-static SIMDINLINE Integer SIMDCALL insert_si(Integer const &a, SIMD256Impl::Integer const &b)
+template <int imm>
+static SIMDINLINE Integer SIMDCALL insert_si(Integer const& a, SIMD256Impl::Integer const& b)
  {
      SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
      Integer r = a;
@@ -514,43 +516,43 @@ static SIMDINLINE Integer SIMDCALL insert_si(Integer const &a, SIMD256Impl::Inte
      return r;
  }
  
-SIMD_IWRAPPER_2(packs_epi16);      // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
-SIMD_IWRAPPER_2(packs_epi32);      // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
-SIMD_IWRAPPER_2(packus_epi16);     // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
-SIMD_IWRAPPER_2(packus_epi32);     // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
  
-static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
+template <int ImmT>
+static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
  {
-    Integer result;
-
-    // Ugly slow implementation
-    uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a);
-    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
-    uint32_t *pResult = reinterpret_cast<uint32_t *>(&result);
-
-    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
-    {
-        pResult[i] = pA[0xF & pSwiz[i]];
-    }
+    return Float{
+        SIMD256T::template permute_ps<ImmT>(a.v8[0]),
+        SIMD256T::template permute_ps<ImmT>(a.v8[1]),
+    };
+}
  
-    return result;
+static SIMDINLINE Integer SIMDCALL permute_epi32(
+    Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
+{
+    return castps_si(permute_ps(castsi_ps(a), swiz));
  }
  
-static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+static SIMDINLINE Float SIMDCALL
+                        permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float)
  {
-    Float result;
+    const auto mask = SIMD256T::set1_epi32(7);
  
-    // Ugly slow implementation
-    float const *pA = reinterpret_cast<float const*>(&a);
-    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
-    float *pResult = reinterpret_cast<float *>(&result);
+    auto lolo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[0], mask));
+    auto lohi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[0], mask));
  
-    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
-    {
-        pResult[i] = pA[0xF & pSwiz[i]];
-    }
+    auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], mask));
+    auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], mask));
  
-    return result;
+    return Float{
+        SIMD256T::blendv_ps(
+            lolo, lohi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))),
+        SIMD256T::blendv_ps(
+            hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))),
+    };
  }
  
  // All of the 512-bit permute2f128_XX intrinsics do the following:
@@ -564,7 +566,7 @@ static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz)
  //              ESAC
  //              RETURN tmp[127:0]
  //      }
-//      
+//
  //      dst[127:0]   : = SELECT4(a[511:0], imm8[1:0])
  //      dst[255:128] : = SELECT4(a[511:0], imm8[3:2])
  //      dst[383:256] : = SELECT4(b[511:0], imm8[5:4])
@@ -576,33 +578,36 @@ static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz)
  // AVX instructions for emulation.
  //
  template <int shuf>
-static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const &a, Float const &b)
+static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const& a, Float const& b)
  {
-    return Float
-    {
-        SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
-        SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
+    return Float{
+        SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
+                                                                                        a.v8[1]),
+        SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
+                                                                                        b.v8[1]),
      };
  }
  
  template <int shuf>
-static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const &a, Double const &b)
+static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const& a, Double const& b)
  {
-    return Double
-    {
-        SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
-        SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
+    return Double{
+        SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
+                                                                                        a.v8[1]),
+        SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
+                                                                                        b.v8[1]),
      };
  }
  
  template <int shuf>
-static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const &a, Integer const &b)
+static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const& a, Integer const& b)
  {
-    return Integer
-       {
-        SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
-        SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
-       };
+    return Integer{
+        SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
+                                                                                        a.v8[1]),
+        SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
+                                                                                        b.v8[1]),
+    };
  }
  
  SIMD_IWRAPPER_2I_1(shuffle_epi32);
@@ -626,209 +631,213 @@ SIMD_WRAPPER_2(unpacklo_ps);
  //-----------------------------------------------------------------------
  // Load / store operations
  //-----------------------------------------------------------------------
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const &idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
+static SIMDINLINE Float SIMDCALL
+                        i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
  {
-    return Float
-    {
+    return Float{
          SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
          SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
      };
  }
  
-static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
+static SIMDINLINE Float SIMDCALL
+                        sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    return Float{
+        SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[0]),
+        SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[1]),
+    };
+}
+
+static SIMDINLINE Float SIMDCALL
+                        load1_ps(float const* p) // return *p    (broadcast 1 value to all elements)
  {
      return broadcast_ss(p);
  }
  
-static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
+static SIMDINLINE Float SIMDCALL
+                        load_ps(float const* p) // return *p    (loads SIMD width elements from memory)
  {
-    return Float
-    {
-        SIMD256T::load_ps(p),
-        SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)
-    };
+    return Float{SIMD256T::load_ps(p), SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)};
  }
  
-static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
+static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
  {
-    return Integer
-    {
+    return Integer{
          SIMD256T::load_si(&p->v8[0]),
          SIMD256T::load_si(&p->v8[1]),
      };
  }
  
-static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+static SIMDINLINE Float SIMDCALL
+                        loadu_ps(float const* p) // return *p    (same as load_ps but allows for unaligned mem)
  {
-    return Float
-    {
-        SIMD256T::loadu_ps(p),
-        SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)
-    };
+    return Float{SIMD256T::loadu_ps(p), SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)};
  }
  
-static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+static SIMDINLINE Integer SIMDCALL
+                          loadu_si(Integer const* p) // return *p    (same as load_si but allows for unaligned mem)
  {
-    return Integer
-    {
+    return Integer{
          SIMD256T::loadu_si(&p->v8[0]),
          SIMD256T::loadu_si(&p->v8[1]),
      };
  }
  
  // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const &old, float const* p, Integer const &idx, Float const &mask)
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
+static SIMDINLINE Float SIMDCALL
+                        mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
  {
-    return Float
-    {
+    return Float{
          SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
          SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
      };
  }
  
-static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer const &mask, Float const &src)
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
+static SIMDINLINE Float SIMDCALL
+                        sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
+{
+    return Float{
+        SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
+        SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
+    };
+}
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src)
  {
      SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
      SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
  }
  
-static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const &a)
+static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const& a)
  {
      uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
-             mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
+    mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
  
      return mask;
  }
  
-static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const &a)
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a)
  {
      uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
-             mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
+    mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
  
      return mask;
  }
-static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const &a)
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a)
  {
      uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
-             mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
+    mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
  
      return mask;
  }
  
  static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
  {
-    return Integer
-    {
-        SIMD256T::set1_epi32(i),
-        SIMD256T::set1_epi32(i)
-    };
+    return Integer{SIMD256T::set1_epi32(i), SIMD256T::set1_epi32(i)};
  }
  
  static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
  {
-    return Integer
-    {
-        SIMD256T::set1_epi8(i),
-        SIMD256T::set1_epi8(i)
-    };
+    return Integer{SIMD256T::set1_epi8(i), SIMD256T::set1_epi8(i)};
  }
  
-static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
+static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
  {
-    return Float
-    {
-        SIMD256T::set1_ps(f),
-        SIMD256T::set1_ps(f)
-    };
+    return Float{SIMD256T::set1_ps(f), SIMD256T::set1_ps(f)};
  }
  
-static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
+static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
  {
-    return Float
-    {
-        SIMD256T::setzero_ps(),
-        SIMD256T::setzero_ps()
-    };
+    return Float{SIMD256T::setzero_ps(), SIMD256T::setzero_ps()};
  }
  
-static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
+static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
  {
-    return Integer
-    {
-        SIMD256T::setzero_si(),
-        SIMD256T::setzero_si()
-    };
+    return Integer{SIMD256T::setzero_si(), SIMD256T::setzero_si()};
  }
  
-static SIMDINLINE void SIMDCALL store_ps(float *p, Float const &a)    // *p = a   (stores all elements contiguously in memory)
+static SIMDINLINE void SIMDCALL
+                       store_ps(float* p, Float const& a) // *p = a   (stores all elements contiguously in memory)
  {
      SIMD256T::store_ps(p, a.v8[0]);
      SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
  }
  
-static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer const &a)   // *p = a
+static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a) // *p = a
  {
      SIMD256T::store_si(&p->v8[0], a.v8[0]);
      SIMD256T::store_si(&p->v8[1], a.v8[1]);
  }
  
-static SIMDINLINE void SIMDCALL stream_ps(float *p, Float const &a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
+static SIMDINLINE void SIMDCALL
+                       stream_ps(float* p, Float const& a) // *p = a   (same as store_ps, but doesn't keep memory in cache)
  {
      SIMD256T::stream_ps(p, a.v8[0]);
      SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
  }
  
-static SIMDINLINE Integer SIMDCALL set_epi32(
-    int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
-    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+static SIMDINLINE Integer SIMDCALL set_epi32(int i15,
+                                             int i14,
+                                             int i13,
+                                             int i12,
+                                             int i11,
+                                             int i10,
+                                             int i9,
+                                             int i8,
+                                             int i7,
+                                             int i6,
+                                             int i5,
+                                             int i4,
+                                             int i3,
+                                             int i2,
+                                             int i1,
+                                             int i0)
  {
-    return Integer
-    {
-        SIMD256T::set_epi32(
-            i7, i6, i5, i4, i3, i2, i1, i0),
-        SIMD256T::set_epi32(
-            i15, i14, i13, i12, i11, i10, i9, i8)
-    };
+    return Integer{SIMD256T::set_epi32(i7, i6, i5, i4, i3, i2, i1, i0),
+                   SIMD256T::set_epi32(i15, i14, i13, i12, i11, i10, i9, i8)};
  }
  
-static SIMDINLINE Integer SIMDCALL set_epi32(
-    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+static SIMDINLINE Integer SIMDCALL
+                          set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
  {
-    return set_epi32(
-        0, 0, 0, 0, 0, 0, 0, 0,
-        i7, i6, i5, i4, i3, i2, i1, i0);
+    return set_epi32(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
  }
  
-static SIMDINLINE Float SIMDCALL set_ps(
-    float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
-    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+static SIMDINLINE Float SIMDCALL set_ps(float i15,
+                                        float i14,
+                                        float i13,
+                                        float i12,
+                                        float i11,
+                                        float i10,
+                                        float i9,
+                                        float i8,
+                                        float i7,
+                                        float i6,
+                                        float i5,
+                                        float i4,
+                                        float i3,
+                                        float i2,
+                                        float i1,
+                                        float i0)
  {
-    return Float
-    {
-        SIMD256T::set_ps(
-            i7, i6, i5, i4, i3, i2, i1, i0),
-        SIMD256T::set_ps(
-            i15, i14, i13, i12, i11, i10, i9, i8)
-    };
+    return Float{SIMD256T::set_ps(i7, i6, i5, i4, i3, i2, i1, i0),
+                 SIMD256T::set_ps(i15, i14, i13, i12, i11, i10, i9, i8)};
  }
  
-static SIMDINLINE Float SIMDCALL set_ps(
-    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+static SIMDINLINE Float SIMDCALL
+                        set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
  {
-    return set_ps(
-        0, 0, 0, 0, 0, 0, 0, 0,
-        i7, i6, i5, i4, i3, i2, i1, i0);
+    return set_ps(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
  }
  
  static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
  {
-    return Float
-    {
-        SIMD256T::vmask_ps(mask),
-        SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH)
-    };
+    return Float{SIMD256T::vmask_ps(mask), SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH)};
  }
  
  #undef SIMD_WRAPPER_1