1 /****************************************************************************
2 * Copyright (C) 2017 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 ****************************************************************************/
25 #include "simdlib_types.hpp"
27 // For documentation, please see the following include...
28 // #include "simdlib_interface.hpp"
// --- 128-bit SIMD ISA implementation tiers --------------------------------
// Each tier's member functions are pulled in from an .inl file; the
// __SIMD_LIB_*_HPP__ guard macro tells the shared .inl sources which ISA
// level is currently being expanded.
34 #if SIMD_ARCH >= SIMD_ARCH_AVX
37 #define __SIMD_LIB_AVX_HPP__
38 #include "simdlib_128_avx.inl"
39 #undef __SIMD_LIB_AVX_HPP__
41 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
// AVX2 tier: derives from the AVX tier and overrides/extends its operations.
43 #if SIMD_ARCH >= SIMD_ARCH_AVX2
44 struct AVX2Impl : AVXImpl
46 #define __SIMD_LIB_AVX2_HPP__
47 #include "simdlib_128_avx2.inl"
48 #undef __SIMD_LIB_AVX2_HPP__
50 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
// AVX512 tier: the 128-bit AVX512 overrides are optional (guarded by
// SIMD_OPT_128_AVX512), with a Knights Landing vs. Core split for the
// target-specific portions.
52 #if SIMD_ARCH >= SIMD_ARCH_AVX512
53 struct AVX512Impl : AVX2Impl
55 #if defined(SIMD_OPT_128_AVX512)
56 #define __SIMD_LIB_AVX512_HPP__
57 #include "simdlib_128_avx512.inl"
58 #if defined(SIMD_ARCH_KNIGHTS)
59 #include "simdlib_128_avx512_knights.inl"
60 #else // optimize for core
61 #include "simdlib_128_avx512_core.inl"
62 #endif // defined(SIMD_ARCH_KNIGHTS)
63 #undef __SIMD_LIB_AVX512_HPP__
64 #endif // SIMD_OPT_128_AVX512
66 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
// Traits: binds the strongest ISA implementation available for the
// compile-time SIMD_ARCH together with the 128-bit register/vector types.
68 struct Traits : SIMDImpl::Traits
70 #if SIMD_ARCH == SIMD_ARCH_AVX
71 using IsaImpl = AVXImpl;
72 #elif SIMD_ARCH == SIMD_ARCH_AVX2
73 using IsaImpl = AVX2Impl;
74 #elif SIMD_ARCH == SIMD_ARCH_AVX512
75 using IsaImpl = AVX512Impl;
77 #error Invalid value for SIMD_ARCH
80 using Float = SIMD128Impl::Float;
81 using Double = SIMD128Impl::Double;
82 using Integer = SIMD128Impl::Integer;
83 using Vec4 = SIMD128Impl::Vec4;
84 using Mask = SIMD128Impl::Mask;
86 } // namespace SIMD128Impl
// --- 256-bit SIMD ISA implementation tiers --------------------------------
// Mirrors the 128-bit layering above, using the 256-bit .inl sources.
90 #if SIMD_ARCH >= SIMD_ARCH_AVX
93 #define __SIMD_LIB_AVX_HPP__
94 #include "simdlib_256_avx.inl"
95 #undef __SIMD_LIB_AVX_HPP__
97 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
// AVX2 tier: derives from the AVX tier and overrides/extends its operations.
99 #if SIMD_ARCH >= SIMD_ARCH_AVX2
100 struct AVX2Impl : AVXImpl
102 #define __SIMD_LIB_AVX2_HPP__
103 #include "simdlib_256_avx2.inl"
104 #undef __SIMD_LIB_AVX2_HPP__
105 }; // struct AVX2Impl
106 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
// AVX512 tier: 256-bit AVX512 overrides are optional (SIMD_OPT_256_AVX512),
// with a Knights Landing vs. Core split for the target-specific portions.
108 #if SIMD_ARCH >= SIMD_ARCH_AVX512
109 struct AVX512Impl : AVX2Impl
111 #if defined(SIMD_OPT_256_AVX512)
112 #define __SIMD_LIB_AVX512_HPP__
113 #include "simdlib_256_avx512.inl"
114 #if defined(SIMD_ARCH_KNIGHTS)
115 #include "simdlib_256_avx512_knights.inl"
116 #else // optimize for core
117 #include "simdlib_256_avx512_core.inl"
118 #endif // defined(SIMD_ARCH_KNIGHTS)
119 #undef __SIMD_LIB_AVX512_HPP__
120 #endif // SIMD_OPT_256_AVX512
121 }; // struct AVX512Impl
122 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
// Traits: binds the strongest ISA implementation available for the
// compile-time SIMD_ARCH together with the 256-bit register/vector types.
124 struct Traits : SIMDImpl::Traits
126 #if SIMD_ARCH == SIMD_ARCH_AVX
127 using IsaImpl = AVXImpl;
128 #elif SIMD_ARCH == SIMD_ARCH_AVX2
129 using IsaImpl = AVX2Impl;
130 #elif SIMD_ARCH == SIMD_ARCH_AVX512
131 using IsaImpl = AVX512Impl;
133 #error Invalid value for SIMD_ARCH
136 using Float = SIMD256Impl::Float;
137 using Double = SIMD256Impl::Double;
138 using Integer = SIMD256Impl::Integer;
139 using Vec4 = SIMD256Impl::Vec4;
140 using Mask = SIMD256Impl::Mask;
142 } // namespace SIMD256Impl
// --- 512-bit SIMD ISA implementation tiers --------------------------------
// Below AVX512 there is no native 512-bit ISA, so AVXImplBase emulates
// 512-bit operations on top of a pair of 256-bit implementations
// (parameterized by SIMD256T via simdlib_512_emu*.inl).
144 namespace SIMD512Impl
146 #if SIMD_ARCH >= SIMD_ARCH_AVX
147 template <typename SIMD256T>
150 #define __SIMD_LIB_AVX_HPP__
151 #include "simdlib_512_emu.inl"
152 #include "simdlib_512_emu_masks.inl"
153 #undef __SIMD_LIB_AVX_HPP__
154 }; // struct AVXImplBase
155 using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>;
156 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
// AVX2 tier: same emulation layer, backed by the 256-bit AVX2 implementation.
158 #if SIMD_ARCH >= SIMD_ARCH_AVX2
159 using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>;
160 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
// AVX512 tier: native 512-bit implementation (not optional here, unlike the
// 128/256-bit AVX512 overrides), with a Knights vs. Core split.
162 #if SIMD_ARCH >= SIMD_ARCH_AVX512
163 struct AVX512Impl : AVXImplBase<SIMD256Impl::AVX512Impl>
165 #define __SIMD_LIB_AVX512_HPP__
166 #include "simdlib_512_avx512.inl"
167 #include "simdlib_512_avx512_masks.inl"
168 #if defined(SIMD_ARCH_KNIGHTS)
169 #include "simdlib_512_avx512_knights.inl"
170 #include "simdlib_512_avx512_masks_knights.inl"
171 #else // optimize for core
172 #include "simdlib_512_avx512_core.inl"
173 #include "simdlib_512_avx512_masks_core.inl"
174 #endif // defined(SIMD_ARCH_KNIGHTS)
175 #undef __SIMD_LIB_AVX512_HPP__
176 }; // struct AVX512Impl
177 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
// Traits: binds the strongest ISA implementation available for the
// compile-time SIMD_ARCH together with the 512-bit register/vector types.
179 struct Traits : SIMDImpl::Traits
181 #if SIMD_ARCH == SIMD_ARCH_AVX
182 using IsaImpl = AVXImpl;
183 #elif SIMD_ARCH == SIMD_ARCH_AVX2
184 using IsaImpl = AVX2Impl;
185 #elif SIMD_ARCH == SIMD_ARCH_AVX512
186 using IsaImpl = AVX512Impl;
188 #error Invalid value for SIMD_ARCH
191 using Float = SIMD512Impl::Float;
192 using Double = SIMD512Impl::Double;
193 using Integer = SIMD512Impl::Integer;
194 using Vec4 = SIMD512Impl::Vec4;
195 using Mask = SIMD512Impl::Mask;
197 } // namespace SIMD512Impl
198 } // namespace SIMDImpl
// SIMDBase: the public SIMD facade. Inherits the per-ISA operations from
// Traits::IsaImpl and layers portable Vec4 (SoA 4-vector) and small-matrix
// helpers on top. All helpers operate per SIMD lane: each Vec4 component is
// a full SIMD register holding that component for every lane.
200 template <typename Traits>
201 struct SIMDBase : Traits::IsaImpl
// Vocabulary types re-exported from the selected Traits so callers can
// write e.g. SIMD256::Float, SIMD256::Integer.
203 using CompareType = typename Traits::CompareType;
204 using ScaleFactor = typename Traits::ScaleFactor;
205 using RoundMode = typename Traits::RoundMode;
206 using SIMD = typename Traits::IsaImpl;
207 using Float = typename Traits::Float;
208 using Double = typename Traits::Double;
209 using Integer = typename Traits::Integer;
210 using Vec4 = typename Traits::Vec4;
211 using Mask = typename Traits::Mask;
// Width of one SIMD register in bytes (sizeof the SIMD float type).
213 static const size_t VECTOR_BYTES = sizeof(Float);
215 // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
216 static SIMDINLINE void vec4_load1_ps(Vec4& r, const float* p)
218 r[0] = SIMD::set1_ps(p[0]);
219 r[1] = SIMD::set1_ps(p[1]);
220 r[2] = SIMD::set1_ps(p[2]);
221 r[3] = SIMD::set1_ps(p[3]);
// Broadcasts the SIMD scalar s into all four components of r.
224 static SIMDINLINE void vec4_set1_vps(Vec4& r, Float const& s)
// 3-component dot product per lane: v0.x*v1.x + v0.y*v1.y + v0.z*v1.z
// (the w components are ignored).
232 static SIMDINLINE Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1)
235 r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
237 tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
238 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
240 tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
241 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
// 4-component dot product per lane: dp3 plus the w term.
246 static SIMDINLINE Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1)
249 r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
251 tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
252 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
254 tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
255 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
257 tmp = SIMD::mul_ps(v0[3], v1[3]); // (v0.w*v1.w)
258 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + (v0.w*v1.w)
// Reciprocal length: 1/sqrt(dot4(v,v)). Uses the approximate rsqrt
// instruction, so precision is limited (no Newton-Raphson refinement).
263 static SIMDINLINE Float vec4_rcp_length_ps(const Vec4& v)
265 Float length = vec4_dp4_ps(v, v);
266 return SIMD::rsqrt_ps(length);
// Normalizes v into r. Note: all four components, w included, are scaled.
269 static SIMDINLINE void vec4_normalize_ps(Vec4& r, const Vec4& v)
271 Float rcpLength = vec4_rcp_length_ps(v);
273 r[0] = SIMD::mul_ps(v[0], rcpLength);
274 r[1] = SIMD::mul_ps(v[1], rcpLength);
275 r[2] = SIMD::mul_ps(v[2], rcpLength);
276 r[3] = SIMD::mul_ps(v[3], rcpLength);
// r = v * s (component-wise scale by a SIMD scalar).
279 static SIMDINLINE void vec4_mul_ps(Vec4& r, const Vec4& v, Float const& s)
281 r[0] = SIMD::mul_ps(v[0], s);
282 r[1] = SIMD::mul_ps(v[1], s);
283 r[2] = SIMD::mul_ps(v[2], s);
284 r[3] = SIMD::mul_ps(v[3], s);
// r = v0 * v1 (component-wise product).
287 static SIMDINLINE void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
289 r[0] = SIMD::mul_ps(v0[0], v1[0]);
290 r[1] = SIMD::mul_ps(v0[1], v1[1]);
291 r[2] = SIMD::mul_ps(v0[2], v1[2]);
292 r[3] = SIMD::mul_ps(v0[3], v1[3]);
// r = v0 + s (adds the SIMD scalar to every component).
295 static SIMDINLINE void vec4_add_ps(Vec4& r, const Vec4& v0, Float const& s)
297 r[0] = SIMD::add_ps(v0[0], s);
298 r[1] = SIMD::add_ps(v0[1], s);
299 r[2] = SIMD::add_ps(v0[2], s);
300 r[3] = SIMD::add_ps(v0[3], s);
// r = v0 + v1 (component-wise sum).
303 static SIMDINLINE void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
305 r[0] = SIMD::add_ps(v0[0], v1[0]);
306 r[1] = SIMD::add_ps(v0[1], v1[1]);
307 r[2] = SIMD::add_ps(v0[2], v1[2]);
308 r[3] = SIMD::add_ps(v0[3], v1[3]);
// r = min(v0, s) per component.
311 static SIMDINLINE void vec4_min_ps(Vec4& r, const Vec4& v0, Float const& s)
313 r[0] = SIMD::min_ps(v0[0], s);
314 r[1] = SIMD::min_ps(v0[1], s);
315 r[2] = SIMD::min_ps(v0[2], s);
316 r[3] = SIMD::min_ps(v0[3], s);
// r = max(v0, s) per component.
319 static SIMDINLINE void vec4_max_ps(Vec4& r, const Vec4& v0, Float const& s)
321 r[0] = SIMD::max_ps(v0[0], s);
322 r[1] = SIMD::max_ps(v0[1], s);
323 r[2] = SIMD::max_ps(v0[2], s);
324 r[3] = SIMD::max_ps(v0[3], s);
327 // Matrix4x4 * Vector4
328 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
329 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
330 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
331 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
// pMatrix is a row-major 4x4 float matrix (element [row][col] is loaded at
// pMatrix + row * 4 + col); each element is broadcast across all lanes.
332 static SIMDINLINE void SIMDCALL mat4x4_vec4_multiply(Vec4& result,
333 const float* pMatrix,
// Row 0 -> result.x
340 m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0]
341 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
342 m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1]
343 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
344 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
345 m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2]
346 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
347 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
348 m = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3]
349 r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.w)
350 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
// Row 1 -> result.y
353 m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0]
354 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
355 m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1]
356 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
357 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
358 m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2]
359 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
360 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
361 m = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3]
362 r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.w)
363 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
// Row 2 -> result.z
366 m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0]
367 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
368 m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1]
369 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
370 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
371 m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2]
372 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
373 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
374 m = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3]
375 r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.w)
376 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
// Row 3 -> result.w
379 m = SIMD::load1_ps(pMatrix + 3 * 4 + 0); // m[row][0]
380 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
381 m = SIMD::load1_ps(pMatrix + 3 * 4 + 1); // m[row][1]
382 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
383 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
384 m = SIMD::load1_ps(pMatrix + 3 * 4 + 2); // m[row][2]
385 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
386 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
387 m = SIMD::load1_ps(pMatrix + 3 * 4 + 3); // m[row][3]
388 r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.w)
389 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
393 // Matrix4x4 * Vector3 - Direction Vector where w = 0.
394 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
395 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
396 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
397 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
// Since w = 0 the 4th matrix column never contributes, so only the upper
// 3x3 portion of the (row-major, stride-4) matrix is read; result.w is
// forced to zero below.
398 static SIMDINLINE void SIMDCALL mat3x3_vec3_w0_multiply(Vec4& result,
399 const float* pMatrix,
// Row 0 -> result.x
406 m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0]
407 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
408 m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1]
409 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
410 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
411 m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2]
412 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
413 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
// Row 1 -> result.y
416 m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0]
417 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
418 m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1]
419 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
420 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
421 m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2]
422 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
423 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
// Row 2 -> result.z
426 m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0]
427 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
428 m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1]
429 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
430 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
431 m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2]
432 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
433 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
// Direction vectors have w = 0.
436 result[3] = SIMD::setzero_ps();
439 // Matrix4x4 * Vector3 - Position vector where w = 1.
440 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
441 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
442 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
443 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
// Since w = 1 the 4th matrix column is simply added (no multiply needed).
444 static SIMDINLINE void SIMDCALL mat4x4_vec3_w1_multiply(Vec4& result,
445 const float* pMatrix,
// Row 0 -> result.x
452 m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0]
453 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
454 m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1]
455 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
456 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
457 m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2]
458 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
459 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
460 m = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3]
461 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
// Row 1 -> result.y
464 m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0]
465 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
466 m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1]
467 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
468 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
469 m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2]
470 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
471 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
472 m = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3]
473 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
// Row 2 -> result.z
476 m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0]
477 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
478 m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1]
479 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
480 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
481 m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2]
482 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
483 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
484 m = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3]
485 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
// Row 3 -> result.w
488 m = SIMD::load1_ps(pMatrix + 3 * 4 + 0); // m[row][0]
489 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
490 m = SIMD::load1_ps(pMatrix + 3 * 4 + 1); // m[row][1]
491 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
492 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
493 m = SIMD::load1_ps(pMatrix + 3 * 4 + 2); // m[row][2]
494 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
495 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
496 m = SIMD::load1_ps(pMatrix + 3 * 4 + 3); // m[row][3]
497 result[3] = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
// Matrix4x3 * Vector3 - Position vector where w = 1.
// Same as mat4x4_vec3_w1_multiply but the matrix has no meaningful 4th
// row, so result.w is set to the constant 1 instead of being computed.
500 static SIMDINLINE void SIMDCALL mat4x3_vec3_w1_multiply(Vec4& result,
501 const float* pMatrix,
// Row 0 -> result.x
508 m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0]
509 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
510 m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1]
511 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
512 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
513 m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2]
514 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
515 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
516 m = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3]
517 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
// Row 1 -> result.y
520 m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0]
521 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
522 m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1]
523 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
524 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
525 m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2]
526 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
527 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
528 m = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3]
529 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
// Row 2 -> result.z
532 m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0]
533 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
534 m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1]
535 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
536 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
537 m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2]
538 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
539 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
540 m = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3]
541 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
// No 4th matrix row: w is the constant 1 for position vectors.
543 result[3] = SIMD::set1_ps(1.0f);
545 }; // struct SIMDBase
// Concrete SIMD facades for each register width, bound to the strongest
// ISA implementation selected above via the per-width Traits.
547 using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
548 using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>;
549 using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>;
// Width-generic alias templates so code templated on a SIMD_T (SIMD128 /
// SIMD256 / SIMD512) can name its associated types without "typename".
551 template <typename SIMD_T>
552 using CompareType = typename SIMD_T::CompareType;
553 template <typename SIMD_T>
554 using ScaleFactor = typename SIMD_T::ScaleFactor;
555 template <typename SIMD_T>
556 using RoundMode = typename SIMD_T::RoundMode;
557 template <typename SIMD_T>
558 using Float = typename SIMD_T::Float;
559 template <typename SIMD_T>
560 using Double = typename SIMD_T::Double;
561 template <typename SIMD_T>
562 using Integer = typename SIMD_T::Integer;
563 template <typename SIMD_T>
564 using Vec4 = typename SIMD_T::Vec4;
565 template <typename SIMD_T>
566 using Mask = typename SIMD_T::Mask;