src/gallium/drivers/swr/rasterizer/common/intrin.h

   1 /****************************************************************************
   2  * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  ****************************************************************************/
  23
  24 #ifndef __SWR_INTRIN_H__
  25 #define __SWR_INTRIN_H__
  26
  27 #include "os.h"
  28
  29 #if !defined(SIMD_ARCH)
  30 #define SIMD_ARCH KNOB_ARCH
  31 #endif
  32
  33 #include "simdlib_types.hpp"
  34
  35 typedef SIMDImpl::SIMD128Impl::Float   simd4scalar;
  36 typedef SIMDImpl::SIMD128Impl::Double  simd4scalard;
  37 typedef SIMDImpl::SIMD128Impl::Integer simd4scalari;
  38 typedef SIMDImpl::SIMD128Impl::Vec4    simd4vector;
  39 typedef SIMDImpl::SIMD128Impl::Mask    simd4mask;
  40
  41 typedef SIMDImpl::SIMD256Impl::Float   simd8scalar;
  42 typedef SIMDImpl::SIMD256Impl::Double  simd8scalard;
  43 typedef SIMDImpl::SIMD256Impl::Integer simd8scalari;
  44 typedef SIMDImpl::SIMD256Impl::Vec4    simd8vector;
  45 typedef SIMDImpl::SIMD256Impl::Mask    simd8mask;
  46
  47 typedef SIMDImpl::SIMD512Impl::Float   simd16scalar;
  48 typedef SIMDImpl::SIMD512Impl::Double  simd16scalard;
  49 typedef SIMDImpl::SIMD512Impl::Integer simd16scalari;
  50 typedef SIMDImpl::SIMD512Impl::Vec4    simd16vector;
  51 typedef SIMDImpl::SIMD512Impl::Mask    simd16mask;
  52
  53 #if KNOB_SIMD_WIDTH == 8
  54 typedef simd8scalar  simdscalar;
  55 typedef simd8scalard simdscalard;
  56 typedef simd8scalari simdscalari;
  57 typedef simd8vector  simdvector;
  58 typedef simd8mask    simdmask;
  59 #else
  60 #error Unsupported vector width
  61 #endif
  62
  63 INLINE
  64 UINT pdep_u32(UINT a, UINT mask)
  65 {
  66 #if KNOB_ARCH >= KNOB_ARCH_AVX2
  67     return _pdep_u32(a, mask);
  68 #else
  69     UINT result = 0;
  70
  71     // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
  72     // using bsf instead of funky loop
  73     DWORD maskIndex;
  74     while (_BitScanForward(&maskIndex, mask))
  75     {
  76         // 1. isolate lowest set bit of mask
  77         const UINT lowest = 1 << maskIndex;
  78
  79         // 2. populate LSB from src
  80         const UINT LSB = (UINT)((int)(a << 31) >> 31);
  81
  82         // 3. copy bit from mask
  83         result |= LSB & lowest;
  84
  85         // 4. clear lowest bit
  86         mask &= ~lowest;
  87
  88         // 5. prepare for next iteration
  89         a >>= 1;
  90     }
  91
  92     return result;
  93 #endif
  94 }
  95
  96 INLINE
  97 UINT pext_u32(UINT a, UINT mask)
  98 {
  99 #if KNOB_ARCH >= KNOB_ARCH_AVX2
 100     return _pext_u32(a, mask);
 101 #else
 102     UINT     result = 0;
 103     DWORD    maskIndex;
 104     uint32_t currentBit = 0;
 105     while (_BitScanForward(&maskIndex, mask))
 106     {
 107         // 1. isolate lowest set bit of mask
 108         const UINT lowest = 1 << maskIndex;
 109
 110         // 2. copy bit from mask
 111         result |= ((a & lowest) > 0) << currentBit++;
 112
 113         // 3. clear lowest bit
 114         mask &= ~lowest;
 115     }
 116     return result;
 117 #endif
 118 }
 119
 120 #endif //__SWR_INTRIN_H__