src/gallium/drivers/swr/rasterizer/common/intrin.h

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 ****************************************************************************/
  23
  24 #ifndef __SWR_INTRIN_H__
  25 #define __SWR_INTRIN_H__
  26
  27 #include "os.h"
  28
  29 #include <cassert>
  30
  31 #include <emmintrin.h>
  32 #include <immintrin.h>
  33 #include <xmmintrin.h>
  34
// Select the SIMD register types for the configured vector width.
// Only KNOB_SIMD_WIDTH == 8 is supported; any other value stops the build
// with the #error below.
#if KNOB_SIMD_WIDTH == 8
typedef __m256 simdscalar;      // 256-bit packed single-precision float register
typedef __m256i simdscalari;    // 256-bit packed integer register
typedef uint8_t simdmask;       // 8-bit mask; presumably one bit per lane -- TODO confirm
#else
#error Unsupported vector width
#endif
  42
  43 // simd vector
  44 OSALIGNSIMD(union) simdvector
  45 {
  46     simdscalar  v[4];
  47     struct
  48     {
  49         simdscalar x, y, z, w;
  50     };
  51
  52     simdscalar& operator[] (const int i) { return v[i]; }
  53     const simdscalar& operator[] (const int i) const { return v[i]; }
  54 };
  55
  56 #if ENABLE_AVX512_SIMD16
  57
  58 #if KNOB_SIMD16_WIDTH == 16
  59
  60 #if ENABLE_AVX512_EMULATION
  61 struct simd16scalar
  62 {
  63     __m256  lo;
  64     __m256  hi;
  65 };
  66 struct simd16scalard
  67 {
  68     __m256d lo;
  69     __m256d hi;
  70 };
  71 struct simd16scalari
  72 {
  73     __m256i lo;
  74     __m256i hi;
  75 };
// Mask type for the emulated simd16 path: a plain 16-bit unsigned integer
// (the non-emulated branch of this #if uses __mmask16 instead).
typedef uint16_t simd16mask;
  77
  78 #else
// Non-emulated path (ENABLE_AVX512_EMULATION is off): the simd16 types are
// direct aliases of the compiler's 512-bit vector types and 16-bit mask type.
typedef __m512 simd16scalar;      // packed single-precision floats
typedef __m512d simd16scalard;    // packed double-precision floats
typedef __m512i simd16scalari;    // packed integers
typedef __mmask16 simd16mask;     // 16-bit mask register type
  83 #endif//ENABLE_AVX512_EMULATION
  84 #else
  85 #error Unsupported vector width
  86 #endif//KNOB_SIMD16_WIDTH == 16
  87
  88 #define _simd16_masklo(mask) ((mask) & 0xFF)
  89 #define _simd16_maskhi(mask) (((mask) >> 8) & 0xFF)
  90 #define _simd16_setmask(hi, lo) (((hi) << 8) | (lo))
  91
// SIMDAPI: calling-convention annotation for SIMD functions.
// Expands to __vectorcall when _WIN32 is defined and to nothing otherwise.
#if defined(_WIN32)
#define SIMDAPI __vectorcall
#else
#define SIMDAPI
#endif
  97
  98 OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector
  99 {
 100     simd16scalar  v[4];
 101     struct
 102     {
 103         simd16scalar x, y, z, w;
 104     };
 105
 106     simd16scalar& operator[] (const int i) { return v[i]; }
 107     const simd16scalar& operator[] (const int i) const { return v[i]; }
 108 };
 109
 110 #endif // ENABLE_AVX512_SIMD16
 111
 112 INLINE
 113 UINT pdep_u32(UINT a, UINT mask)
 114 {
 115 #if KNOB_ARCH >= KNOB_ARCH_AVX2
 116     return _pdep_u32(a, mask);
 117 #else
 118     UINT result = 0;
 119
 120     // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
 121     // using bsf instead of funky loop
 122     DWORD maskIndex;
 123     while (_BitScanForward(&maskIndex, mask))
 124     {
 125         // 1. isolate lowest set bit of mask
 126         const UINT lowest = 1 << maskIndex;
 127
 128         // 2. populate LSB from src
 129         const UINT LSB = (UINT)((int)(a << 31) >> 31);
 130
 131         // 3. copy bit from mask
 132         result |= LSB & lowest;
 133
 134         // 4. clear lowest bit
 135         mask &= ~lowest;
 136
 137         // 5. prepare for next iteration
 138         a >>= 1;
 139     }
 140
 141     return result;
 142 #endif
 143 }
 144
 145 INLINE
 146 UINT pext_u32(UINT a, UINT mask)
 147 {
 148 #if KNOB_ARCH >= KNOB_ARCH_AVX2
 149     return _pext_u32(a, mask);
 150 #else
 151     UINT result = 0;
 152     DWORD maskIndex;
 153     uint32_t currentBit = 0;
 154     while (_BitScanForward(&maskIndex, mask))
 155     {
 156         // 1. isolate lowest set bit of mask
 157         const UINT lowest = 1 << maskIndex;
 158
 159         // 2. copy bit from mask
 160         result |= ((a & lowest) > 0) << currentBit++;
 161
 162         // 3. clear lowest bit
 163         mask &= ~lowest;
 164     }
 165     return result;
 166 #endif
 167 }
 168
 169 #endif//__SWR_INTRIN_H__