swr/rast: fix 32-bit compilation on Linux
[mesa.git] / src / gallium / drivers / swr / rasterizer / common / simdlib.hpp
1 /****************************************************************************
2 * Copyright (C) 2017 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 ****************************************************************************/
23 #pragma once
24
25 #include "simdlib_types.hpp"
26
27 // For documentation, please see the following include...
28 // #include "simdlib_interface.hpp"
29
30 namespace SIMDImpl
31 {
32 namespace SIMD128Impl
33 {
34 #if SIMD_ARCH >= SIMD_ARCH_AVX
35 struct AVXImpl
36 {
37 #define __SIMD_LIB_AVX_HPP__
38 #include "simdlib_128_avx.inl"
39 #undef __SIMD_LIB_AVX_HPP__
40 }; // struct AVXImpl
41 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
42
43 #if SIMD_ARCH >= SIMD_ARCH_AVX2
44 struct AVX2Impl : AVXImpl
45 {
46 #define __SIMD_LIB_AVX2_HPP__
47 #include "simdlib_128_avx2.inl"
48 #undef __SIMD_LIB_AVX2_HPP__
49 }; // struct AVX2Impl
50 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
51
52 #if SIMD_ARCH >= SIMD_ARCH_AVX512
53 struct AVX512Impl : AVX2Impl
54 {
55 #if defined(SIMD_OPT_128_AVX512)
56 #define __SIMD_LIB_AVX512_HPP__
57 #include "simdlib_128_avx512.inl"
58 #if defined(SIMD_ARCH_KNIGHTS)
59 #include "simdlib_128_avx512_knights.inl"
60 #else // optimize for core
61 #include "simdlib_128_avx512_core.inl"
62 #endif // defined(SIMD_ARCH_KNIGHTS)
63 #undef __SIMD_LIB_AVX512_HPP__
64 #endif // SIMD_OPT_128_AVX512
65 }; // struct AVX2Impl
66 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
67
68 struct Traits : SIMDImpl::Traits
69 {
70 #if SIMD_ARCH == SIMD_ARCH_AVX
71 using IsaImpl = AVXImpl;
72 #elif SIMD_ARCH == SIMD_ARCH_AVX2
73 using IsaImpl = AVX2Impl;
74 #elif SIMD_ARCH == SIMD_ARCH_AVX512
75 using IsaImpl = AVX512Impl;
76 #else
77 #error Invalid value for SIMD_ARCH
78 #endif
79
80 using Float = SIMD128Impl::Float;
81 using Double = SIMD128Impl::Double;
82 using Integer = SIMD128Impl::Integer;
83 using Vec4 = SIMD128Impl::Vec4;
84 using Mask = SIMD128Impl::Mask;
85 };
86 } // namespace SIMD128Impl
87
88 namespace SIMD256Impl
89 {
90 #if SIMD_ARCH >= SIMD_ARCH_AVX
91 struct AVXImpl
92 {
93 #define __SIMD_LIB_AVX_HPP__
94 #include "simdlib_256_avx.inl"
95 #undef __SIMD_LIB_AVX_HPP__
96 }; // struct AVXImpl
97 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
98
99 #if SIMD_ARCH >= SIMD_ARCH_AVX2
100 struct AVX2Impl : AVXImpl
101 {
102 #define __SIMD_LIB_AVX2_HPP__
103 #include "simdlib_256_avx2.inl"
104 #undef __SIMD_LIB_AVX2_HPP__
105 }; // struct AVX2Impl
106 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
107
108 #if SIMD_ARCH >= SIMD_ARCH_AVX512
109 struct AVX512Impl : AVX2Impl
110 {
111 #if defined(SIMD_OPT_256_AVX512)
112 #define __SIMD_LIB_AVX512_HPP__
113 #include "simdlib_256_avx512.inl"
114 #if defined(SIMD_ARCH_KNIGHTS)
115 #include "simdlib_256_avx512_knights.inl"
116 #else // optimize for core
117 #include "simdlib_256_avx512_core.inl"
118 #endif // defined(SIMD_ARCH_KNIGHTS)
119 #undef __SIMD_LIB_AVX512_HPP__
120 #endif // SIMD_OPT_256_AVX512
121 }; // struct AVX2Impl
122 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
123
124 struct Traits : SIMDImpl::Traits
125 {
126 #if SIMD_ARCH == SIMD_ARCH_AVX
127 using IsaImpl = AVXImpl;
128 #elif SIMD_ARCH == SIMD_ARCH_AVX2
129 using IsaImpl = AVX2Impl;
130 #elif SIMD_ARCH == SIMD_ARCH_AVX512
131 using IsaImpl = AVX512Impl;
132 #else
133 #error Invalid value for SIMD_ARCH
134 #endif
135
136 using Float = SIMD256Impl::Float;
137 using Double = SIMD256Impl::Double;
138 using Integer = SIMD256Impl::Integer;
139 using Vec4 = SIMD256Impl::Vec4;
140 using Mask = SIMD256Impl::Mask;
141 };
142 } // namespace SIMD256Impl
143
144 namespace SIMD512Impl
145 {
146 #if SIMD_ARCH >= SIMD_ARCH_AVX
147 template <typename SIMD256T>
148 struct AVXImplBase
149 {
150 #define __SIMD_LIB_AVX_HPP__
151 #include "simdlib_512_emu.inl"
152 #include "simdlib_512_emu_masks.inl"
153 #undef __SIMD_LIB_AVX_HPP__
154 }; // struct AVXImplBase
155 using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>;
156 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
157
158 #if SIMD_ARCH >= SIMD_ARCH_AVX2
159 using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>;
160 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
161
162 #if SIMD_ARCH >= SIMD_ARCH_AVX512
163 struct AVX512Impl : AVXImplBase<SIMD256Impl::AVX512Impl>
164 {
165 #define __SIMD_LIB_AVX512_HPP__
166 #include "simdlib_512_avx512.inl"
167 #include "simdlib_512_avx512_masks.inl"
168 #if defined(SIMD_ARCH_KNIGHTS)
169 #include "simdlib_512_avx512_knights.inl"
170 #include "simdlib_512_avx512_masks_knights.inl"
171 #else // optimize for core
172 #include "simdlib_512_avx512_core.inl"
173 #include "simdlib_512_avx512_masks_core.inl"
174 #endif // defined(SIMD_ARCH_KNIGHTS)
175 #undef __SIMD_LIB_AVX512_HPP__
176 }; // struct AVX512ImplBase
177 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
178
179 struct Traits : SIMDImpl::Traits
180 {
181 #if SIMD_ARCH == SIMD_ARCH_AVX
182 using IsaImpl = AVXImpl;
183 #elif SIMD_ARCH == SIMD_ARCH_AVX2
184 using IsaImpl = AVX2Impl;
185 #elif SIMD_ARCH == SIMD_ARCH_AVX512
186 using IsaImpl = AVX512Impl;
187 #else
188 #error Invalid value for SIMD_ARCH
189 #endif
190
191 using Float = SIMD512Impl::Float;
192 using Double = SIMD512Impl::Double;
193 using Integer = SIMD512Impl::Integer;
194 using Vec4 = SIMD512Impl::Vec4;
195 using Mask = SIMD512Impl::Mask;
196 };
197 } // namespace SIMD512Impl
198 } // namespace SIMDImpl
199
200 template <typename Traits>
201 struct SIMDBase : Traits::IsaImpl
202 {
203 using CompareType = typename Traits::CompareType;
204 using ScaleFactor = typename Traits::ScaleFactor;
205 using RoundMode = typename Traits::RoundMode;
206 using SIMD = typename Traits::IsaImpl;
207 using Float = typename Traits::Float;
208 using Double = typename Traits::Double;
209 using Integer = typename Traits::Integer;
210 using Vec4 = typename Traits::Vec4;
211 using Mask = typename Traits::Mask;
212
213 static const size_t VECTOR_BYTES = sizeof(Float);
214
215 // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
216 static SIMDINLINE void vec4_load1_ps(Vec4& r, const float* p)
217 {
218 r[0] = SIMD::set1_ps(p[0]);
219 r[1] = SIMD::set1_ps(p[1]);
220 r[2] = SIMD::set1_ps(p[2]);
221 r[3] = SIMD::set1_ps(p[3]);
222 }
223
224 static SIMDINLINE void vec4_set1_vps(Vec4& r, Float const& s)
225 {
226 r[0] = s;
227 r[1] = s;
228 r[2] = s;
229 r[3] = s;
230 }
231
232 static SIMDINLINE Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1)
233 {
234 Float tmp, r;
235 r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
236
237 tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
238 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
239
240 tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
241 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
242
243 return r;
244 }
245
246 static SIMDINLINE Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1)
247 {
248 Float tmp, r;
249 r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
250
251 tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
252 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
253
254 tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
255 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
256
257 tmp = SIMD::mul_ps(v0[3], v1[3]); // (v0.w*v1.w)
258 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
259
260 return r;
261 }
262
263 static SIMDINLINE Float vec4_rcp_length_ps(const Vec4& v)
264 {
265 Float length = vec4_dp4_ps(v, v);
266 return SIMD::rsqrt_ps(length);
267 }
268
269 static SIMDINLINE void vec4_normalize_ps(Vec4& r, const Vec4& v)
270 {
271 Float rcpLength = vec4_rcp_length_ps(v);
272
273 r[0] = SIMD::mul_ps(v[0], rcpLength);
274 r[1] = SIMD::mul_ps(v[1], rcpLength);
275 r[2] = SIMD::mul_ps(v[2], rcpLength);
276 r[3] = SIMD::mul_ps(v[3], rcpLength);
277 }
278
279 static SIMDINLINE void vec4_mul_ps(Vec4& r, const Vec4& v, Float const& s)
280 {
281 r[0] = SIMD::mul_ps(v[0], s);
282 r[1] = SIMD::mul_ps(v[1], s);
283 r[2] = SIMD::mul_ps(v[2], s);
284 r[3] = SIMD::mul_ps(v[3], s);
285 }
286
287 static SIMDINLINE void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
288 {
289 r[0] = SIMD::mul_ps(v0[0], v1[0]);
290 r[1] = SIMD::mul_ps(v0[1], v1[1]);
291 r[2] = SIMD::mul_ps(v0[2], v1[2]);
292 r[3] = SIMD::mul_ps(v0[3], v1[3]);
293 }
294
295 static SIMDINLINE void vec4_add_ps(Vec4& r, const Vec4& v0, Float const& s)
296 {
297 r[0] = SIMD::add_ps(v0[0], s);
298 r[1] = SIMD::add_ps(v0[1], s);
299 r[2] = SIMD::add_ps(v0[2], s);
300 r[3] = SIMD::add_ps(v0[3], s);
301 }
302
303 static SIMDINLINE void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
304 {
305 r[0] = SIMD::add_ps(v0[0], v1[0]);
306 r[1] = SIMD::add_ps(v0[1], v1[1]);
307 r[2] = SIMD::add_ps(v0[2], v1[2]);
308 r[3] = SIMD::add_ps(v0[3], v1[3]);
309 }
310
311 static SIMDINLINE void vec4_min_ps(Vec4& r, const Vec4& v0, Float const& s)
312 {
313 r[0] = SIMD::min_ps(v0[0], s);
314 r[1] = SIMD::min_ps(v0[1], s);
315 r[2] = SIMD::min_ps(v0[2], s);
316 r[3] = SIMD::min_ps(v0[3], s);
317 }
318
319 static SIMDINLINE void vec4_max_ps(Vec4& r, const Vec4& v0, Float const& s)
320 {
321 r[0] = SIMD::max_ps(v0[0], s);
322 r[1] = SIMD::max_ps(v0[1], s);
323 r[2] = SIMD::max_ps(v0[2], s);
324 r[3] = SIMD::max_ps(v0[3], s);
325 }
326
327 // Matrix4x4 * Vector4
328 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
329 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
330 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
331 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
332 static SIMDINLINE void SIMDCALL mat4x4_vec4_multiply(Vec4& result,
333 const float* pMatrix,
334 const Vec4& v)
335 {
336 Float m;
337 Float r0;
338 Float r1;
339
340 m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0]
341 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
342 m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1]
343 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
344 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
345 m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2]
346 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
347 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
348 m = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3]
349 r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z)
350 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
351 result[0] = r0;
352
353 m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0]
354 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
355 m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1]
356 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
357 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
358 m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2]
359 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
360 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
361 m = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3]
362 r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z)
363 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
364 result[1] = r0;
365
366 m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0]
367 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
368 m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1]
369 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
370 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
371 m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2]
372 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
373 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
374 m = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3]
375 r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z)
376 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
377 result[2] = r0;
378
379 m = SIMD::load1_ps(pMatrix + 3 * 4 + 0); // m[row][0]
380 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
381 m = SIMD::load1_ps(pMatrix + 3 * 4 + 1); // m[row][1]
382 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
383 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
384 m = SIMD::load1_ps(pMatrix + 3 * 4 + 2); // m[row][2]
385 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
386 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
387 m = SIMD::load1_ps(pMatrix + 3 * 4 + 3); // m[row][3]
388 r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z)
389 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
390 result[3] = r0;
391 }
392
393 // Matrix4x4 * Vector3 - Direction Vector where w = 0.
394 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
395 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
396 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
397 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
398 static SIMDINLINE void SIMDCALL mat3x3_vec3_w0_multiply(Vec4& result,
399 const float* pMatrix,
400 const Vec4& v)
401 {
402 Float m;
403 Float r0;
404 Float r1;
405
406 m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0]
407 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
408 m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1]
409 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
410 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
411 m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2]
412 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
413 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
414 result[0] = r0;
415
416 m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0]
417 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
418 m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1]
419 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
420 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
421 m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2]
422 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
423 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
424 result[1] = r0;
425
426 m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0]
427 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
428 m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1]
429 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
430 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
431 m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2]
432 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
433 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
434 result[2] = r0;
435
436 result[3] = SIMD::setzero_ps();
437 }
438
439 // Matrix4x4 * Vector3 - Position vector where w = 1.
440 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
441 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
442 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
443 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
444 static SIMDINLINE void SIMDCALL mat4x4_vec3_w1_multiply(Vec4& result,
445 const float* pMatrix,
446 const Vec4& v)
447 {
448 Float m;
449 Float r0;
450 Float r1;
451
452 m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0]
453 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
454 m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1]
455 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
456 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
457 m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2]
458 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
459 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
460 m = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3]
461 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
462 result[0] = r0;
463
464 m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0]
465 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
466 m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1]
467 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
468 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
469 m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2]
470 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
471 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
472 m = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3]
473 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
474 result[1] = r0;
475
476 m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0]
477 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
478 m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1]
479 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
480 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
481 m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2]
482 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
483 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
484 m = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3]
485 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
486 result[2] = r0;
487
488 m = SIMD::load1_ps(pMatrix + 3 * 4 + 0); // m[row][0]
489 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
490 m = SIMD::load1_ps(pMatrix + 3 * 4 + 1); // m[row][1]
491 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
492 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
493 m = SIMD::load1_ps(pMatrix + 3 * 4 + 2); // m[row][2]
494 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
495 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
496 m = SIMD::load1_ps(pMatrix + 3 * 4 + 3); // m[row][3]
497 result[3] = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
498 }
499
500 static SIMDINLINE void SIMDCALL mat4x3_vec3_w1_multiply(Vec4& result,
501 const float* pMatrix,
502 const Vec4& v)
503 {
504 Float m;
505 Float r0;
506 Float r1;
507
508 m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0]
509 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
510 m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1]
511 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
512 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
513 m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2]
514 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
515 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
516 m = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3]
517 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
518 result[0] = r0;
519
520 m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0]
521 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
522 m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1]
523 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
524 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
525 m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2]
526 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
527 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
528 m = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3]
529 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
530 result[1] = r0;
531
532 m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0]
533 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
534 m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1]
535 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
536 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
537 m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2]
538 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
539 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
540 m = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3]
541 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
542 result[2] = r0;
543 result[3] = SIMD::set1_ps(1.0f);
544 }
545 }; // struct SIMDBase
546
547 using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
548 using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>;
549 using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>;
550
// Shorthand for the member types of a SIMD front end, so that code templated
// on the front end can write Float<SIMD_T> instead of
// typename SIMD_T::Float (and likewise for the other member types).
template <class SimdT> using CompareType = typename SimdT::CompareType;
template <class SimdT> using ScaleFactor = typename SimdT::ScaleFactor;
template <class SimdT> using RoundMode   = typename SimdT::RoundMode;
template <class SimdT> using Float       = typename SimdT::Float;
template <class SimdT> using Double      = typename SimdT::Double;
template <class SimdT> using Integer     = typename SimdT::Integer;
template <class SimdT> using Vec4        = typename SimdT::Vec4;
template <class SimdT> using Mask        = typename SimdT::Mask;
567