f45b2e55880a28fa275336be0ef2c18adf2402c2
[mesa.git] / src / gallium / drivers / swr / rasterizer / common / intrin.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 ****************************************************************************/
23
24 #ifndef __SWR_INTRIN_H__
25 #define __SWR_INTRIN_H__
26
27 #include "os.h"
28
29 #include <cassert>
30
31 #include <emmintrin.h>
32 #include <immintrin.h>
33 #include <xmmintrin.h>
34
35 #if KNOB_SIMD_WIDTH == 8
36 typedef __m256 simdscalar;
37 typedef __m256i simdscalari;
38 typedef uint8_t simdmask;
39 #else
40 #error Unsupported vector width
41 #endif
42
43 // simd vector
44 OSALIGNSIMD(union) simdvector
45 {
46 simdscalar v[4];
47 struct
48 {
49 simdscalar x, y, z, w;
50 };
51
52 simdscalar& operator[] (const int i) { return v[i]; }
53 const simdscalar& operator[] (const int i) const { return v[i]; }
54 };
55
56 #if ENABLE_AVX512_SIMD16
57
58 #if KNOB_SIMD16_WIDTH == 16
59
60 #if ENABLE_AVX512_EMULATION
61 struct simd16scalar
62 {
63 __m256 lo;
64 __m256 hi;
65 };
66 struct simd16scalard
67 {
68 __m256d lo;
69 __m256d hi;
70 };
71 struct simd16scalari
72 {
73 __m256i lo;
74 __m256i hi;
75 };
76 typedef uint16_t simd16mask;
77
78 #else
79 typedef __m512 simd16scalar;
80 typedef __m512d simd16scalard;
81 typedef __m512i simd16scalari;
82 typedef __mmask16 simd16mask;
83 #endif//ENABLE_AVX512_EMULATION
84 #else
85 #error Unsupported vector width
86 #endif//KNOB_SIMD16_WIDTH == 16
87
// Helpers for splitting a 16-bit mask into two 8-bit halves and joining
// them back together.
//   _simd16_masklo(mask)     - bits 0..7 of mask
//   _simd16_maskhi(mask)     - bits 8..15 of mask, shifted down to bits 0..7
//   _simd16_setmask(hi, lo)  - hi placed in bits 8 and up, OR'ed with lo
#define _simd16_masklo(mask)        (0xFF & (mask))
#define _simd16_maskhi(mask)        (0xFF & ((mask) >> 8))
#define _simd16_setmask(hi, lo)     ((lo) | ((hi) << 8))

// Calling-convention decoration for SIMD functions: __vectorcall when
// _WIN32 is defined, nothing otherwise.
#ifdef _WIN32
#define SIMDAPI __vectorcall
#else
#define SIMDAPI
#endif
97
98 OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector
99 {
100 simd16scalar v[4];
101 struct
102 {
103 simd16scalar x, y, z, w;
104 };
105
106 simd16scalar& operator[] (const int i) { return v[i]; }
107 const simd16scalar& operator[] (const int i) const { return v[i]; }
108 };
109
110 #endif // ENABLE_AVX512_SIMD16
111
112 INLINE
113 UINT pdep_u32(UINT a, UINT mask)
114 {
115 #if KNOB_ARCH >= KNOB_ARCH_AVX2
116 return _pdep_u32(a, mask);
117 #else
118 UINT result = 0;
119
120 // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
121 // using bsf instead of funky loop
122 DWORD maskIndex;
123 while (_BitScanForward(&maskIndex, mask))
124 {
125 // 1. isolate lowest set bit of mask
126 const UINT lowest = 1 << maskIndex;
127
128 // 2. populate LSB from src
129 const UINT LSB = (UINT)((int)(a << 31) >> 31);
130
131 // 3. copy bit from mask
132 result |= LSB & lowest;
133
134 // 4. clear lowest bit
135 mask &= ~lowest;
136
137 // 5. prepare for next iteration
138 a >>= 1;
139 }
140
141 return result;
142 #endif
143 }
144
145 INLINE
146 UINT pext_u32(UINT a, UINT mask)
147 {
148 #if KNOB_ARCH >= KNOB_ARCH_AVX2
149 return _pext_u32(a, mask);
150 #else
151 UINT result = 0;
152 DWORD maskIndex;
153 uint32_t currentBit = 0;
154 while (_BitScanForward(&maskIndex, mask))
155 {
156 // 1. isolate lowest set bit of mask
157 const UINT lowest = 1 << maskIndex;
158
159 // 2. copy bit from mask
160 result |= ((a & lowest) > 0) << currentBit++;
161
162 // 3. clear lowest bit
163 mask &= ~lowest;
164 }
165 return result;
166 #endif
167 }
168
169 #endif//__SWR_INTRIN_H__