1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
22 ****************************************************************************/
24 #ifndef __SWR_SIMDINTRIN_H__
25 #define __SWR_SIMDINTRIN_H__
27 #include "common/intrin.h"
28 #include "common/simdlib.hpp"
30 #if KNOB_SIMD_WIDTH == 8
33 #error Unsupported vector width
34 #endif // KNOB_SIMD16_WIDTH == 16
// ---------------------------------------------------------------------------
// Legacy intrinsic-style names.
//
// Every macro below forwards to the same-named static member of SIMD (the
// native-width implementation) or SIMD128. Macros that take an immediate
// operand pass it through as a template argument of the target function.
// Each name is defined exactly once.
// ---------------------------------------------------------------------------

// 128-bit helpers
#define _simd128_maskstore_ps SIMD128::maskstore_ps
#define _simd128_fmadd_ps SIMD128::fmadd_ps

// Single-precision float operations
#define _simd_load_ps SIMD::load_ps
#define _simd_load1_ps SIMD::broadcast_ss
#define _simd_loadu_ps SIMD::loadu_ps
#define _simd_setzero_ps SIMD::setzero_ps
#define _simd_set1_ps SIMD::set1_ps
#define _simd_blend_ps(a, b, i) SIMD::blend_ps<i>(a, b)
#define _simd_blend_epi32(a, b, i) SIMD::blend_epi32<i>(a, b)
#define _simd_blendv_ps SIMD::blendv_ps
#define _simd_store_ps SIMD::store_ps
#define _simd_mul_ps SIMD::mul_ps
#define _simd_add_ps SIMD::add_ps
#define _simd_sub_ps SIMD::sub_ps
#define _simd_rsqrt_ps SIMD::rsqrt_ps
#define _simd_min_ps SIMD::min_ps
#define _simd_max_ps SIMD::max_ps
#define _simd_movemask_ps SIMD::movemask_ps
#define _simd_cvtps_epi32 SIMD::cvtps_epi32
#define _simd_cvttps_epi32 SIMD::cvttps_epi32
#define _simd_cvtepi32_ps SIMD::cvtepi32_ps
#define _simd_cmplt_ps SIMD::cmplt_ps
#define _simd_cmpgt_ps SIMD::cmpgt_ps
#define _simd_cmpneq_ps SIMD::cmpneq_ps
#define _simd_cmpeq_ps SIMD::cmpeq_ps
#define _simd_cmpge_ps SIMD::cmpge_ps
#define _simd_cmple_ps SIMD::cmple_ps
#define _simd_cmp_ps(a, b, imm) SIMD::cmp_ps<SIMD::CompareType(imm)>(a, b)
#define _simd_and_ps SIMD::and_ps
#define _simd_or_ps SIMD::or_ps
#define _simd_rcp_ps SIMD::rcp_ps
#define _simd_div_ps SIMD::div_ps
#define _simd_castsi_ps SIMD::castsi_ps
#define _simd_castps_pd SIMD::castps_pd
#define _simd_castpd_ps SIMD::castpd_ps
#define _simd_andnot_ps SIMD::andnot_ps
#define _simd_round_ps(a, i) SIMD::round_ps<SIMD::RoundMode(i)>(a)
#define _simd_broadcast_ps(a) SIMD::broadcast_ps((SIMD128::Float const*)(a))
#define _simd_stream_ps SIMD::stream_ps

// Double-precision helpers
#define _simd_movemask_pd SIMD::movemask_pd
#define _simd_castsi_pd SIMD::castsi_pd

// Integer operations
#define _simd_mul_epi32 SIMD::mul_epi32
#define _simd_mullo_epi32 SIMD::mullo_epi32
#define _simd_sub_epi32 SIMD::sub_epi32
#define _simd_sub_epi64 SIMD::sub_epi64
#define _simd_min_epi32 SIMD::min_epi32
#define _simd_min_epu32 SIMD::min_epu32
#define _simd_max_epi32 SIMD::max_epi32
#define _simd_max_epu32 SIMD::max_epu32
#define _simd_add_epi32 SIMD::add_epi32
#define _simd_and_si SIMD::and_si
#define _simd_andnot_si SIMD::andnot_si
#define _simd_cmpeq_epi32 SIMD::cmpeq_epi32
#define _simd_cmplt_epi32 SIMD::cmplt_epi32
#define _simd_cmpgt_epi32 SIMD::cmpgt_epi32
#define _simd_or_si SIMD::or_si
#define _simd_xor_si SIMD::xor_si
#define _simd_castps_si SIMD::castps_si
#define _simd_adds_epu8 SIMD::adds_epu8
#define _simd_subs_epu8 SIMD::subs_epu8
#define _simd_add_epi8 SIMD::add_epi8
#define _simd_cmpeq_epi64 SIMD::cmpeq_epi64
#define _simd_cmpgt_epi64 SIMD::cmpgt_epi64
#define _simd_cmpgt_epi8 SIMD::cmpgt_epi8
#define _simd_cmpeq_epi8 SIMD::cmpeq_epi8
#define _simd_cmpgt_epi16 SIMD::cmpgt_epi16
#define _simd_cmpeq_epi16 SIMD::cmpeq_epi16
#define _simd_movemask_epi8 SIMD::movemask_epi8
#define _simd_permute_ps_i(a, i) SIMD::permute_ps<i>(a)
#define _simd_permute_ps SIMD::permute_ps
#define _simd_permute_epi32 SIMD::permute_epi32
#define _simd_srlv_epi32 SIMD::srlv_epi32
#define _simd_sllv_epi32 SIMD::sllv_epi32

// Integer unpack
#define _simd_unpacklo_epi8 SIMD::unpacklo_epi8
#define _simd_unpackhi_epi8 SIMD::unpackhi_epi8
#define _simd_unpacklo_epi16 SIMD::unpacklo_epi16
#define _simd_unpackhi_epi16 SIMD::unpackhi_epi16
#define _simd_unpacklo_epi32 SIMD::unpacklo_epi32
#define _simd_unpackhi_epi32 SIMD::unpackhi_epi32
#define _simd_unpacklo_epi64 SIMD::unpacklo_epi64
#define _simd_unpackhi_epi64 SIMD::unpackhi_epi64

// Shifts by an immediate count
#define _simd_slli_epi32(a, i) SIMD::slli_epi32<i>(a)
#define _simd_srai_epi32(a, i) SIMD::srai_epi32<i>(a)
#define _simd_srli_epi32(a, i) SIMD::srli_epi32<i>(a)
#define _simd_srlisi_ps(a, i) SIMD::srlisi_ps<i>(a)

// Fused multiply-add/subtract and byte shuffle
#define _simd_fmadd_ps SIMD::fmadd_ps
#define _simd_fmsub_ps SIMD::fmsub_ps
#define _simd_shuffle_epi8 SIMD::shuffle_epi8

// Gathers; the scale argument is converted to SIMD::ScaleFactor
#define _simd_i32gather_ps(p, o, s) SIMD::i32gather_ps<SIMD::ScaleFactor(s)>(p, o)
#define _simd_mask_i32gather_ps(r, p, o, m, s) \
    SIMD::mask_i32gather_ps<SIMD::ScaleFactor(s)>(r, p, o, m)
#define _simd_abs_epi32 SIMD::abs_epi32

// Unsigned widening conversions
#define _simd_cvtepu8_epi16 SIMD::cvtepu8_epi16
#define _simd_cvtepu8_epi32 SIMD::cvtepu8_epi32
#define _simd_cvtepu16_epi32 SIMD::cvtepu16_epi32
#define _simd_cvtepu16_epi64 SIMD::cvtepu16_epi64
#define _simd_cvtepu32_epi64 SIMD::cvtepu32_epi64

// Narrowing packs
#define _simd_packus_epi16 SIMD::packus_epi16
#define _simd_packs_epi16 SIMD::packs_epi16
#define _simd_packus_epi32 SIMD::packus_epi32
#define _simd_packs_epi32 SIMD::packs_epi32

// Float/double unpack, 128-bit lane insert/extract/permute, shuffles, sets,
// loads/stores and tests
#define _simd_unpacklo_ps SIMD::unpacklo_ps
#define _simd_unpackhi_ps SIMD::unpackhi_ps
#define _simd_unpacklo_pd SIMD::unpacklo_pd
#define _simd_unpackhi_pd SIMD::unpackhi_pd
#define _simd_insertf128_ps SIMD::insertf128_ps
#define _simd_insertf128_pd SIMD::insertf128_pd
#define _simd_insertf128_si(a, b, i) SIMD::insertf128_si<i>(a, b)
#define _simd_extractf128_ps(a, i) SIMD::extractf128_ps<i>(a)
#define _simd_extractf128_pd(a, i) SIMD::extractf128_pd<i>(a)
#define _simd_extractf128_si(a, i) SIMD::extractf128_si<i>(a)
#define _simd_permute2f128_ps(a, b, i) SIMD::permute2f128_ps<i>(a, b)
#define _simd_permute2f128_pd(a, b, i) SIMD::permute2f128_pd<i>(a, b)
#define _simd_permute2f128_si(a, b, i) SIMD::permute2f128_si<i>(a, b)
#define _simd_shuffle_ps(a, b, i) SIMD::shuffle_ps<i>(a, b)
#define _simd_shuffle_pd(a, b, i) SIMD::shuffle_pd<i>(a, b)
#define _simd_shuffle_epi32(a, b, imm8) SIMD::shuffle_epi32<imm8>(a, b)
#define _simd_shuffle_epi64(a, b, imm8) SIMD::shuffle_epi64<imm8>(a, b)
#define _simd_set1_epi32 SIMD::set1_epi32
#define _simd_set_epi32 SIMD::set_epi32
#define _simd_set_ps SIMD::set_ps
#define _simd_set1_epi8 SIMD::set1_epi8
#define _simd_setzero_si SIMD::setzero_si
#define _simd_store_si SIMD::store_si
#define _simd_broadcast_ss SIMD::broadcast_ss
#define _simd_maskstore_ps SIMD::maskstore_ps
#define _simd_load_si SIMD::load_si
#define _simd_loadu_si SIMD::loadu_si
#define _simd_testz_ps SIMD::testz_ps
#define _simd_testz_si SIMD::testz_si
#define _simd_xor_ps SIMD::xor_ps

// Split 2 x 128-bit unaligned load/store
#define _simd_loadu2_si SIMD::loadu2_si
#define _simd_storeu2_si SIMD::storeu2_si

// Variable blend and mask expansion
#define _simd_blendv_epi32 SIMD::blendv_epi32
#define _simd_vmask_ps SIMD::vmask_ps
188 SIMDINLINE
SIMD128::Integer
_simd_blend4_epi32(SIMD128::Integer
const& a
, SIMD128::Integer
const& b
)
190 return SIMD128::castps_si(
191 SIMD128::blend_ps
<mask
>(SIMD128::castsi_ps(a
), SIMD128::castsi_ps(b
)));
194 //////////////////////////////////////////////////////////////////////////
195 /// @brief Compute plane equation vA * vX + vB * vY + vC
196 SIMDINLINE simdscalar
vplaneps(simdscalar
const& vA
,
197 simdscalar
const& vB
,
198 simdscalar
const& vC
,
199 simdscalar
const& vX
,
200 simdscalar
const& vY
)
202 simdscalar vOut
= _simd_fmadd_ps(vA
, vX
, vC
);
203 vOut
= _simd_fmadd_ps(vB
, vY
, vOut
);
207 //////////////////////////////////////////////////////////////////////////
208 /// @brief Compute plane equation vA * vX + vB * vY + vC
209 SIMDINLINE simd4scalar
vplaneps(simd4scalar
const& vA
,
210 simd4scalar
const& vB
,
211 simd4scalar
const& vC
,
212 simd4scalar
const& vX
,
213 simd4scalar
const& vY
)
215 simd4scalar vOut
= _simd128_fmadd_ps(vA
, vX
, vC
);
216 vOut
= _simd128_fmadd_ps(vB
, vY
, vOut
);
220 //////////////////////////////////////////////////////////////////////////
221 /// @brief Interpolates a single component.
222 /// @param vI - barycentric I
223 /// @param vJ - barycentric J
224 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
225 template <UINT Attrib
, UINT Comp
, UINT numComponents
= 4>
226 static SIMDINLINE simdscalar
InterpolateComponent(simdscalar
const& vI
,
227 simdscalar
const& vJ
,
228 const float* pInterpBuffer
)
230 const float* pInterpA
= &pInterpBuffer
[Attrib
* 3 * numComponents
+ 0 + Comp
];
231 const float* pInterpB
= &pInterpBuffer
[Attrib
* 3 * numComponents
+ numComponents
+ Comp
];
232 const float* pInterpC
= &pInterpBuffer
[Attrib
* 3 * numComponents
+ numComponents
* 2 + Comp
];
234 if ((pInterpA
[0] == pInterpB
[0]) && (pInterpA
[0] == pInterpC
[0]))
236 // Ensure constant attribs are constant. Required for proper
237 // 3D resource copies.
238 return _simd_broadcast_ss(pInterpA
);
241 simdscalar vA
= _simd_broadcast_ss(pInterpA
);
242 simdscalar vB
= _simd_broadcast_ss(pInterpB
);
243 simdscalar vC
= _simd_broadcast_ss(pInterpC
);
245 simdscalar vk
= _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f
), vI
), vJ
);
246 vC
= _simd_mul_ps(vk
, vC
);
248 return vplaneps(vA
, vB
, vC
, vI
, vJ
);
251 //////////////////////////////////////////////////////////////////////////
252 /// @brief Interpolates a single component (flat shade).
253 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
254 template <UINT Attrib
, UINT Comp
, UINT numComponents
= 4>
255 static SIMDINLINE simdscalar
InterpolateComponentFlat(const float* pInterpBuffer
)
257 const float* pInterpA
= &pInterpBuffer
[Attrib
* 3 * numComponents
+ 0 + Comp
];
259 simdscalar vA
= _simd_broadcast_ss(pInterpA
);
264 //////////////////////////////////////////////////////////////////////////
265 /// @brief Interpolates a single component (flat shade).
266 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
267 template <UINT Attrib
, UINT Comp
, UINT numComponents
= 4>
268 static SIMDINLINE simdscalari
InterpolateComponentFlatInt(const uint32_t* pInterpBuffer
)
270 const uint32_t interpA
= pInterpBuffer
[Attrib
* 3 * numComponents
+ 0 + Comp
];
272 simdscalari vA
= _simd_set1_epi32(interpA
);
277 //////////////////////////////////////////////////////////////////////////
278 /// @brief Interpolates a single component.
279 /// @param vI - barycentric I
280 /// @param vJ - barycentric J
281 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
282 template <UINT Attrib
, UINT Comp
, UINT numComponents
= 4>
283 static SIMDINLINE simd4scalar
InterpolateComponent(simd4scalar
const& vI
,
284 simd4scalar
const& vJ
,
285 const float* pInterpBuffer
)
287 const float* pInterpA
= &pInterpBuffer
[Attrib
* 3 * numComponents
+ 0 + Comp
];
288 const float* pInterpB
= &pInterpBuffer
[Attrib
* 3 * numComponents
+ numComponents
+ Comp
];
289 const float* pInterpC
= &pInterpBuffer
[Attrib
* 3 * numComponents
+ numComponents
* 2 + Comp
];
291 if ((pInterpA
[0] == pInterpB
[0]) && (pInterpA
[0] == pInterpC
[0]))
293 // Ensure constant attribs are constant. Required for proper
294 // 3D resource copies.
295 return SIMD128::broadcast_ss(pInterpA
);
298 simd4scalar vA
= SIMD128::broadcast_ss(pInterpA
);
299 simd4scalar vB
= SIMD128::broadcast_ss(pInterpB
);
300 simd4scalar vC
= SIMD128::broadcast_ss(pInterpC
);
302 simd4scalar vk
= SIMD128::sub_ps(SIMD128::sub_ps(SIMD128::set1_ps(1.0f
), vI
), vJ
);
303 vC
= SIMD128::mul_ps(vk
, vC
);
305 return vplaneps(vA
, vB
, vC
, vI
, vJ
);
308 static SIMDINLINE simd4scalar
_simd128_abs_ps(simd4scalar
const& a
)
310 simd4scalari ai
= SIMD128::castps_si(a
);
311 return SIMD128::castsi_ps(SIMD128::and_si(ai
, SIMD128::set1_epi32(0x7fffffff)));
314 static SIMDINLINE simdscalar
_simd_abs_ps(simdscalar
const& a
)
316 simdscalari ai
= _simd_castps_si(a
);
317 return _simd_castsi_ps(_simd_and_si(ai
, _simd_set1_epi32(0x7fffffff)));
320 #include "simd16intrin.h"
322 #endif //__SWR_SIMDINTRIN_H__