swr/rast: fix core / knights split of AVX512 intrinsics
[mesa.git] / src / gallium / drivers / swr / rasterizer / common / simdlib.hpp
1 /****************************************************************************
2 * Copyright (C) 2017 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 ****************************************************************************/
23 #pragma once
24
25 #include "simdlib_types.hpp"
26
27 // For documentation, please see the following include...
28 // #include "simdlib_interface.hpp"
29
30 namespace SIMDImpl
31 {
32 namespace SIMD128Impl
33 {
34 #if SIMD_ARCH >= SIMD_ARCH_AVX
35 struct AVXImpl
36 {
37 #define __SIMD_LIB_AVX_HPP__
38 #include "simdlib_128_avx.inl"
39 #undef __SIMD_LIB_AVX_HPP__
40 }; // struct AVXImpl
41 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
42
43
44 #if SIMD_ARCH >= SIMD_ARCH_AVX2
45 struct AVX2Impl : AVXImpl
46 {
47 #define __SIMD_LIB_AVX2_HPP__
48 #include "simdlib_128_avx2.inl"
49 #undef __SIMD_LIB_AVX2_HPP__
50 }; // struct AVX2Impl
51 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
52
53 #if SIMD_ARCH >= SIMD_ARCH_AVX512
54 struct AVX512Impl : AVX2Impl
55 {
56 #if defined(SIMD_OPT_128_AVX512)
57 #define __SIMD_LIB_AVX512_HPP__
58 #include "simdlib_128_avx512.inl"
59 #if defined(SIMD_ARCH_KNIGHTS)
60 #include "simdlib_128_avx512_knights.inl"
61 #else // optimize for core
62 #include "simdlib_128_avx512_core.inl"
63 #endif // defined(SIMD_ARCH_KNIGHTS)
64 #undef __SIMD_LIB_AVX512_HPP__
65 #endif // SIMD_OPT_128_AVX512
66 }; // struct AVX2Impl
67 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
68
69 struct Traits : SIMDImpl::Traits
70 {
71 #if SIMD_ARCH == SIMD_ARCH_AVX
72 using IsaImpl = AVXImpl;
73 #elif SIMD_ARCH == SIMD_ARCH_AVX2
74 using IsaImpl = AVX2Impl;
75 #elif SIMD_ARCH == SIMD_ARCH_AVX512
76 using IsaImpl = AVX512Impl;
77 #else
78 #error Invalid value for SIMD_ARCH
79 #endif
80
81 using Float = SIMD128Impl::Float;
82 using Double = SIMD128Impl::Double;
83 using Integer = SIMD128Impl::Integer;
84 using Vec4 = SIMD128Impl::Vec4;
85 using Mask = SIMD128Impl::Mask;
86 };
87 } // ns SIMD128Impl
88
89 namespace SIMD256Impl
90 {
91 #if SIMD_ARCH >= SIMD_ARCH_AVX
92 struct AVXImpl
93 {
94 #define __SIMD_LIB_AVX_HPP__
95 #include "simdlib_256_avx.inl"
96 #undef __SIMD_LIB_AVX_HPP__
97 }; // struct AVXImpl
98 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
99
100
101 #if SIMD_ARCH >= SIMD_ARCH_AVX2
102 struct AVX2Impl : AVXImpl
103 {
104 #define __SIMD_LIB_AVX2_HPP__
105 #include "simdlib_256_avx2.inl"
106 #undef __SIMD_LIB_AVX2_HPP__
107 }; // struct AVX2Impl
108 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
109
110 #if SIMD_ARCH >= SIMD_ARCH_AVX512
111 struct AVX512Impl : AVX2Impl
112 {
113 #if defined(SIMD_OPT_256_AVX512)
114 #define __SIMD_LIB_AVX512_HPP__
115 #include "simdlib_256_avx512.inl"
116 #if defined(SIMD_ARCH_KNIGHTS)
117 #include "simdlib_256_avx512_knights.inl"
118 #else // optimize for core
119 #include "simdlib_256_avx512_core.inl"
120 #endif // defined(SIMD_ARCH_KNIGHTS)
121 #undef __SIMD_LIB_AVX512_HPP__
122 #endif // SIMD_OPT_256_AVX512
123 }; // struct AVX2Impl
124 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
125
126 struct Traits : SIMDImpl::Traits
127 {
128 #if SIMD_ARCH == SIMD_ARCH_AVX
129 using IsaImpl = AVXImpl;
130 #elif SIMD_ARCH == SIMD_ARCH_AVX2
131 using IsaImpl = AVX2Impl;
132 #elif SIMD_ARCH == SIMD_ARCH_AVX512
133 using IsaImpl = AVX512Impl;
134 #else
135 #error Invalid value for SIMD_ARCH
136 #endif
137
138 using Float = SIMD256Impl::Float;
139 using Double = SIMD256Impl::Double;
140 using Integer = SIMD256Impl::Integer;
141 using Vec4 = SIMD256Impl::Vec4;
142 using Mask = SIMD256Impl::Mask;
143 };
144 } // ns SIMD256Impl
145
146 namespace SIMD512Impl
147 {
148 #if SIMD_ARCH >= SIMD_ARCH_AVX
149 template<typename SIMD256T>
150 struct AVXImplBase
151 {
152 #define __SIMD_LIB_AVX_HPP__
153 #include "simdlib_512_emu.inl"
154 #include "simdlib_512_emu_masks.inl"
155 #undef __SIMD_LIB_AVX_HPP__
156 }; // struct AVXImplBase
157 using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>;
158 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
159
160
161 #if SIMD_ARCH >= SIMD_ARCH_AVX2
162 using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>;
163 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
164
165
166 #if SIMD_ARCH >= SIMD_ARCH_AVX512
167 struct AVX512Impl : AVXImplBase<SIMD256Impl::AVX512Impl>
168 {
169 #define __SIMD_LIB_AVX512_HPP__
170 #include "simdlib_512_avx512.inl"
171 #include "simdlib_512_avx512_masks.inl"
172 #if defined(SIMD_ARCH_KNIGHTS)
173 #include "simdlib_512_avx512_knights.inl"
174 #include "simdlib_512_avx512_masks_knights.inl"
175 #else // optimize for core
176 #include "simdlib_512_avx512_core.inl"
177 #include "simdlib_512_avx512_masks_core.inl"
178 #endif // defined(SIMD_ARCH_KNIGHTS)
179 #undef __SIMD_LIB_AVX512_HPP__
180 }; // struct AVX512ImplBase
181 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
182
183 struct Traits : SIMDImpl::Traits
184 {
185 #if SIMD_ARCH == SIMD_ARCH_AVX
186 using IsaImpl = AVXImpl;
187 #elif SIMD_ARCH == SIMD_ARCH_AVX2
188 using IsaImpl = AVX2Impl;
189 #elif SIMD_ARCH == SIMD_ARCH_AVX512
190 using IsaImpl = AVX512Impl;
191 #else
192 #error Invalid value for SIMD_ARCH
193 #endif
194
195 using Float = SIMD512Impl::Float;
196 using Double = SIMD512Impl::Double;
197 using Integer = SIMD512Impl::Integer;
198 using Vec4 = SIMD512Impl::Vec4;
199 using Mask = SIMD512Impl::Mask;
200 };
201 } // ns SIMD512Impl
202 } // ns SIMDImpl
203
204 template <typename Traits>
205 struct SIMDBase : Traits::IsaImpl
206 {
207 using CompareType = typename Traits::CompareType;
208 using ScaleFactor = typename Traits::ScaleFactor;
209 using RoundMode = typename Traits::RoundMode;
210 using SIMD = typename Traits::IsaImpl;
211 using Float = typename Traits::Float;
212 using Double = typename Traits::Double;
213 using Integer = typename Traits::Integer;
214 using Vec4 = typename Traits::Vec4;
215 using Mask = typename Traits::Mask;
216
217 static const size_t VECTOR_BYTES = sizeof(Float);
218
219 // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
220 static SIMDINLINE
221 void vec4_load1_ps(Vec4& r, const float *p)
222 {
223 r[0] = SIMD::set1_ps(p[0]);
224 r[1] = SIMD::set1_ps(p[1]);
225 r[2] = SIMD::set1_ps(p[2]);
226 r[3] = SIMD::set1_ps(p[3]);
227 }
228
229 static SIMDINLINE
230 void vec4_set1_vps(Vec4& r, Float const &s)
231 {
232 r[0] = s;
233 r[1] = s;
234 r[2] = s;
235 r[3] = s;
236 }
237
238 static SIMDINLINE
239 Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1)
240 {
241 Float tmp, r;
242 r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
243
244 tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
245 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
246
247 tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
248 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
249
250 return r;
251 }
252
253 static SIMDINLINE
254 Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1)
255 {
256 Float tmp, r;
257 r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
258
259 tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
260 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
261
262 tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
263 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
264
265 tmp = SIMD::mul_ps(v0[3], v1[3]); // (v0.w*v1.w)
266 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
267
268 return r;
269 }
270
271 static SIMDINLINE
272 Float vec4_rcp_length_ps(const Vec4& v)
273 {
274 Float length = vec4_dp4_ps(v, v);
275 return SIMD::rsqrt_ps(length);
276 }
277
278 static SIMDINLINE
279 void vec4_normalize_ps(Vec4& r, const Vec4& v)
280 {
281 Float rcpLength = vec4_rcp_length_ps(v);
282
283 r[0] = SIMD::mul_ps(v[0], rcpLength);
284 r[1] = SIMD::mul_ps(v[1], rcpLength);
285 r[2] = SIMD::mul_ps(v[2], rcpLength);
286 r[3] = SIMD::mul_ps(v[3], rcpLength);
287 }
288
289 static SIMDINLINE
290 void vec4_mul_ps(Vec4& r, const Vec4& v, Float const &s)
291 {
292 r[0] = SIMD::mul_ps(v[0], s);
293 r[1] = SIMD::mul_ps(v[1], s);
294 r[2] = SIMD::mul_ps(v[2], s);
295 r[3] = SIMD::mul_ps(v[3], s);
296 }
297
298 static SIMDINLINE
299 void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
300 {
301 r[0] = SIMD::mul_ps(v0[0], v1[0]);
302 r[1] = SIMD::mul_ps(v0[1], v1[1]);
303 r[2] = SIMD::mul_ps(v0[2], v1[2]);
304 r[3] = SIMD::mul_ps(v0[3], v1[3]);
305 }
306
307 static SIMDINLINE
308 void vec4_add_ps(Vec4& r, const Vec4& v0, Float const &s)
309 {
310 r[0] = SIMD::add_ps(v0[0], s);
311 r[1] = SIMD::add_ps(v0[1], s);
312 r[2] = SIMD::add_ps(v0[2], s);
313 r[3] = SIMD::add_ps(v0[3], s);
314 }
315
316 static SIMDINLINE
317 void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
318 {
319 r[0] = SIMD::add_ps(v0[0], v1[0]);
320 r[1] = SIMD::add_ps(v0[1], v1[1]);
321 r[2] = SIMD::add_ps(v0[2], v1[2]);
322 r[3] = SIMD::add_ps(v0[3], v1[3]);
323 }
324
325 static SIMDINLINE
326 void vec4_min_ps(Vec4& r, const Vec4& v0, Float const &s)
327 {
328 r[0] = SIMD::min_ps(v0[0], s);
329 r[1] = SIMD::min_ps(v0[1], s);
330 r[2] = SIMD::min_ps(v0[2], s);
331 r[3] = SIMD::min_ps(v0[3], s);
332 }
333
334 static SIMDINLINE
335 void vec4_max_ps(Vec4& r, const Vec4& v0, Float const &s)
336 {
337 r[0] = SIMD::max_ps(v0[0], s);
338 r[1] = SIMD::max_ps(v0[1], s);
339 r[2] = SIMD::max_ps(v0[2], s);
340 r[3] = SIMD::max_ps(v0[3], s);
341 }
342
343 // Matrix4x4 * Vector4
344 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
345 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
346 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
347 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
348 static SIMDINLINE
349 void SIMDCALL mat4x4_vec4_multiply(
350 Vec4& result,
351 const float *pMatrix,
352 const Vec4& v)
353 {
354 Float m;
355 Float r0;
356 Float r1;
357
358 m = SIMD::load1_ps(pMatrix + 0*4 + 0); // m[row][0]
359 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
360 m = SIMD::load1_ps(pMatrix + 0*4 + 1); // m[row][1]
361 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
362 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
363 m = SIMD::load1_ps(pMatrix + 0*4 + 2); // m[row][2]
364 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
365 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
366 m = SIMD::load1_ps(pMatrix + 0*4 + 3); // m[row][3]
367 r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z)
368 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
369 result[0] = r0;
370
371 m = SIMD::load1_ps(pMatrix + 1*4 + 0); // m[row][0]
372 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
373 m = SIMD::load1_ps(pMatrix + 1*4 + 1); // m[row][1]
374 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
375 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
376 m = SIMD::load1_ps(pMatrix + 1*4 + 2); // m[row][2]
377 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
378 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
379 m = SIMD::load1_ps(pMatrix + 1*4 + 3); // m[row][3]
380 r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z)
381 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
382 result[1] = r0;
383
384 m = SIMD::load1_ps(pMatrix + 2*4 + 0); // m[row][0]
385 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
386 m = SIMD::load1_ps(pMatrix + 2*4 + 1); // m[row][1]
387 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
388 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
389 m = SIMD::load1_ps(pMatrix + 2*4 + 2); // m[row][2]
390 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
391 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
392 m = SIMD::load1_ps(pMatrix + 2*4 + 3); // m[row][3]
393 r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z)
394 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
395 result[2] = r0;
396
397 m = SIMD::load1_ps(pMatrix + 3*4 + 0); // m[row][0]
398 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
399 m = SIMD::load1_ps(pMatrix + 3*4 + 1); // m[row][1]
400 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
401 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
402 m = SIMD::load1_ps(pMatrix + 3*4 + 2); // m[row][2]
403 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
404 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
405 m = SIMD::load1_ps(pMatrix + 3*4 + 3); // m[row][3]
406 r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z)
407 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
408 result[3] = r0;
409 }
410
411 // Matrix4x4 * Vector3 - Direction Vector where w = 0.
412 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
413 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
414 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
415 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
416 static SIMDINLINE
417 void SIMDCALL mat3x3_vec3_w0_multiply(
418 Vec4& result,
419 const float *pMatrix,
420 const Vec4& v)
421 {
422 Float m;
423 Float r0;
424 Float r1;
425
426 m = SIMD::load1_ps(pMatrix + 0*4 + 0); // m[row][0]
427 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
428 m = SIMD::load1_ps(pMatrix + 0*4 + 1); // m[row][1]
429 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
430 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
431 m = SIMD::load1_ps(pMatrix + 0*4 + 2); // m[row][2]
432 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
433 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
434 result[0] = r0;
435
436 m = SIMD::load1_ps(pMatrix + 1*4 + 0); // m[row][0]
437 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
438 m = SIMD::load1_ps(pMatrix + 1*4 + 1); // m[row][1]
439 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
440 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
441 m = SIMD::load1_ps(pMatrix + 1*4 + 2); // m[row][2]
442 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
443 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
444 result[1] = r0;
445
446 m = SIMD::load1_ps(pMatrix + 2*4 + 0); // m[row][0]
447 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
448 m = SIMD::load1_ps(pMatrix + 2*4 + 1); // m[row][1]
449 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
450 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
451 m = SIMD::load1_ps(pMatrix + 2*4 + 2); // m[row][2]
452 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
453 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
454 result[2] = r0;
455
456 result[3] = SIMD::setzero_ps();
457 }
458
459 // Matrix4x4 * Vector3 - Position vector where w = 1.
460 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
461 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
462 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
463 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
464 static SIMDINLINE
465 void SIMDCALL mat4x4_vec3_w1_multiply(
466 Vec4& result,
467 const float *pMatrix,
468 const Vec4& v)
469 {
470 Float m;
471 Float r0;
472 Float r1;
473
474 m = SIMD::load1_ps(pMatrix + 0*4 + 0); // m[row][0]
475 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
476 m = SIMD::load1_ps(pMatrix + 0*4 + 1); // m[row][1]
477 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
478 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
479 m = SIMD::load1_ps(pMatrix + 0*4 + 2); // m[row][2]
480 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
481 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
482 m = SIMD::load1_ps(pMatrix + 0*4 + 3); // m[row][3]
483 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
484 result[0] = r0;
485
486 m = SIMD::load1_ps(pMatrix + 1*4 + 0); // m[row][0]
487 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
488 m = SIMD::load1_ps(pMatrix + 1*4 + 1); // m[row][1]
489 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
490 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
491 m = SIMD::load1_ps(pMatrix + 1*4 + 2); // m[row][2]
492 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
493 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
494 m = SIMD::load1_ps(pMatrix + 1*4 + 3); // m[row][3]
495 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
496 result[1] = r0;
497
498 m = SIMD::load1_ps(pMatrix + 2*4 + 0); // m[row][0]
499 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
500 m = SIMD::load1_ps(pMatrix + 2*4 + 1); // m[row][1]
501 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
502 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
503 m = SIMD::load1_ps(pMatrix + 2*4 + 2); // m[row][2]
504 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
505 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
506 m = SIMD::load1_ps(pMatrix + 2*4 + 3); // m[row][3]
507 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
508 result[2] = r0;
509
510 m = SIMD::load1_ps(pMatrix + 3*4 + 0); // m[row][0]
511 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
512 m = SIMD::load1_ps(pMatrix + 3*4 + 1); // m[row][1]
513 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
514 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
515 m = SIMD::load1_ps(pMatrix + 3*4 + 2); // m[row][2]
516 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
517 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
518 m = SIMD::load1_ps(pMatrix + 3*4 + 3); // m[row][3]
519 result[3] = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
520 }
521
522 static SIMDINLINE
523 void SIMDCALL mat4x3_vec3_w1_multiply(
524 Vec4& result,
525 const float *pMatrix,
526 const Vec4& v)
527 {
528 Float m;
529 Float r0;
530 Float r1;
531
532 m = SIMD::load1_ps(pMatrix + 0*4 + 0); // m[row][0]
533 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
534 m = SIMD::load1_ps(pMatrix + 0*4 + 1); // m[row][1]
535 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
536 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
537 m = SIMD::load1_ps(pMatrix + 0*4 + 2); // m[row][2]
538 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
539 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
540 m = SIMD::load1_ps(pMatrix + 0*4 + 3); // m[row][3]
541 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
542 result[0] = r0;
543
544 m = SIMD::load1_ps(pMatrix + 1*4 + 0); // m[row][0]
545 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
546 m = SIMD::load1_ps(pMatrix + 1*4 + 1); // m[row][1]
547 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
548 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
549 m = SIMD::load1_ps(pMatrix + 1*4 + 2); // m[row][2]
550 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
551 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
552 m = SIMD::load1_ps(pMatrix + 1*4 + 3); // m[row][3]
553 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
554 result[1] = r0;
555
556 m = SIMD::load1_ps(pMatrix + 2*4 + 0); // m[row][0]
557 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
558 m = SIMD::load1_ps(pMatrix + 2*4 + 1); // m[row][1]
559 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
560 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
561 m = SIMD::load1_ps(pMatrix + 2*4 + 2); // m[row][2]
562 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
563 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
564 m = SIMD::load1_ps(pMatrix + 2*4 + 3); // m[row][3]
565 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
566 result[2] = r0;
567 result[3] = SIMD::set1_ps(1.0f);
568 }
569 }; // struct SIMDBase
570
571 using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
572 using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>;
573 using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>;