swr/rast: constify swr rasterizer
[mesa.git] / src / gallium / drivers / swr / rasterizer / common / simdlib.hpp
1 /****************************************************************************
2 * Copyright (C) 2017 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 ****************************************************************************/
23 #pragma once
24
25 #include "simdlib_types.hpp"
26
27 // For documentation, please see the following include...
28 // #include "simdlib_interface.hpp"
29
30 namespace SIMDImpl
31 {
32 namespace SIMD128Impl
33 {
34 #if SIMD_ARCH >= SIMD_ARCH_AVX
35 struct AVXImpl
36 {
37 #define __SIMD_LIB_AVX_HPP__
38 #include "simdlib_128_avx.inl"
39 #undef __SIMD_LIB_AVX_HPP__
40 }; // struct AVXImpl
41 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
42
43
44 #if SIMD_ARCH >= SIMD_ARCH_AVX2
45 struct AVX2Impl : AVXImpl
46 {
47 #define __SIMD_LIB_AVX2_HPP__
48 #include "simdlib_128_avx2.inl"
49 #undef __SIMD_LIB_AVX2_HPP__
50 }; // struct AVX2Impl
51 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
52
53 #if SIMD_ARCH >= SIMD_ARCH_AVX512
54 struct AVX512Impl : AVX2Impl
55 {
56 #if defined(SIMD_OPT_128_AVX512)
57 #define __SIMD_LIB_AVX512_HPP__
58 #include "simdlib_128_avx512.inl"
59 #if defined(SIMD_ARCH_KNIGHTS)
60 #include "simdlib_128_avx512_knights.inl"
61 #else // optimize for core
62 #include "simdlib_128_avx512_core.inl"
63 #endif // defined(SIMD_ARCH_KNIGHTS)
64 #undef __SIMD_LIB_AVX512_HPP__
65 #endif // SIMD_OPT_128_AVX512
66 }; // struct AVX2Impl
67 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
68
69 struct Traits : SIMDImpl::Traits
70 {
71 #if SIMD_ARCH == SIMD_ARCH_AVX
72 using IsaImpl = AVXImpl;
73 #elif SIMD_ARCH == SIMD_ARCH_AVX2
74 using IsaImpl = AVX2Impl;
75 #elif SIMD_ARCH == SIMD_ARCH_AVX512
76 using IsaImpl = AVX512Impl;
77 #else
78 #error Invalid value for SIMD_ARCH
79 #endif
80
81 using Float = SIMD128Impl::Float;
82 using Double = SIMD128Impl::Double;
83 using Integer = SIMD128Impl::Integer;
84 using Vec4 = SIMD128Impl::Vec4;
85 using Mask = SIMD128Impl::Mask;
86 };
87 } // ns SIMD128Impl
88
89 namespace SIMD256Impl
90 {
91 #if SIMD_ARCH >= SIMD_ARCH_AVX
92 struct AVXImpl
93 {
94 #define __SIMD_LIB_AVX_HPP__
95 #include "simdlib_256_avx.inl"
96 #undef __SIMD_LIB_AVX_HPP__
97 }; // struct AVXImpl
98 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
99
100
101 #if SIMD_ARCH >= SIMD_ARCH_AVX2
102 struct AVX2Impl : AVXImpl
103 {
104 #define __SIMD_LIB_AVX2_HPP__
105 #include "simdlib_256_avx2.inl"
106 #undef __SIMD_LIB_AVX2_HPP__
107 }; // struct AVX2Impl
108 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
109
110 #if SIMD_ARCH >= SIMD_ARCH_AVX512
111 struct AVX512Impl : AVX2Impl
112 {
113 #if defined(SIMD_OPT_256_AVX512)
114 #define __SIMD_LIB_AVX512_HPP__
115 #include "simdlib_256_avx512.inl"
116 #if defined(SIMD_ARCH_KNIGHTS)
117 #include "simdlib_256_avx512_knights.inl"
118 #else // optimize for core
119 #include "simdlib_256_avx512_core.inl"
120 #endif // defined(SIMD_ARCH_KNIGHTS)
121 #undef __SIMD_LIB_AVX512_HPP__
122 #endif // SIMD_OPT_256_AVX512
123 }; // struct AVX2Impl
124 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
125
126 struct Traits : SIMDImpl::Traits
127 {
128 #if SIMD_ARCH == SIMD_ARCH_AVX
129 using IsaImpl = AVXImpl;
130 #elif SIMD_ARCH == SIMD_ARCH_AVX2
131 using IsaImpl = AVX2Impl;
132 #elif SIMD_ARCH == SIMD_ARCH_AVX512
133 using IsaImpl = AVX512Impl;
134 #else
135 #error Invalid value for SIMD_ARCH
136 #endif
137
138 using Float = SIMD256Impl::Float;
139 using Double = SIMD256Impl::Double;
140 using Integer = SIMD256Impl::Integer;
141 using Vec4 = SIMD256Impl::Vec4;
142 using Mask = SIMD256Impl::Mask;
143 };
144 } // ns SIMD256Impl
145
146 namespace SIMD512Impl
147 {
148 #if SIMD_ARCH >= SIMD_ARCH_AVX
149 template<typename SIMD256T>
150 struct AVXImplBase
151 {
152 #define __SIMD_LIB_AVX_HPP__
153 #include "simdlib_512_emu.inl"
154 #include "simdlib_512_emu_masks.inl"
155 #undef __SIMD_LIB_AVX_HPP__
156 }; // struct AVXImplBase
157 using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>;
158 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
159
160
161 #if SIMD_ARCH >= SIMD_ARCH_AVX2
162 using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>;
163 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
164
165
166 #if SIMD_ARCH >= SIMD_ARCH_AVX512
167 struct AVX512Impl : AVXImplBase<SIMD256Impl::AVX512Impl>
168 {
169 #define __SIMD_LIB_AVX512_HPP__
170 #include "simdlib_512_avx512.inl"
171 #include "simdlib_512_avx512_masks.inl"
172 #if defined(SIMD_ARCH_KNIGHTS)
173 #include "simdlib_512_avx512_knights.inl"
174 #include "simdlib_512_avx512_masks_knights.inl"
175 #else // optimize for core
176 #include "simdlib_512_avx512_core.inl"
177 #include "simdlib_512_avx512_masks_core.inl"
178 #endif // defined(SIMD_ARCH_KNIGHTS)
179 #undef __SIMD_LIB_AVX512_HPP__
180 }; // struct AVX512ImplBase
181 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
182
183 struct Traits : SIMDImpl::Traits
184 {
185 #if SIMD_ARCH == SIMD_ARCH_AVX
186 using IsaImpl = AVXImpl;
187 #elif SIMD_ARCH == SIMD_ARCH_AVX2
188 using IsaImpl = AVX2Impl;
189 #elif SIMD_ARCH == SIMD_ARCH_AVX512
190 using IsaImpl = AVX512Impl;
191 #else
192 #error Invalid value for SIMD_ARCH
193 #endif
194
195 using Float = SIMD512Impl::Float;
196 using Double = SIMD512Impl::Double;
197 using Integer = SIMD512Impl::Integer;
198 using Vec4 = SIMD512Impl::Vec4;
199 using Mask = SIMD512Impl::Mask;
200 };
201 } // ns SIMD512Impl
202 } // ns SIMDImpl
203
204 template <typename Traits>
205 struct SIMDBase : Traits::IsaImpl
206 {
207 using CompareType = typename Traits::CompareType;
208 using ScaleFactor = typename Traits::ScaleFactor;
209 using RoundMode = typename Traits::RoundMode;
210 using SIMD = typename Traits::IsaImpl;
211 using Float = typename Traits::Float;
212 using Double = typename Traits::Double;
213 using Integer = typename Traits::Integer;
214 using Vec4 = typename Traits::Vec4;
215 using Mask = typename Traits::Mask;
216
217 // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
218 static SIMDINLINE
219 void vec4_load1_ps(Vec4& r, const float *p)
220 {
221 r[0] = SIMD::set1_ps(p[0]);
222 r[1] = SIMD::set1_ps(p[1]);
223 r[2] = SIMD::set1_ps(p[2]);
224 r[3] = SIMD::set1_ps(p[3]);
225 }
226
227 static SIMDINLINE
228 void vec4_set1_vps(Vec4& r, Float const &s)
229 {
230 r[0] = s;
231 r[1] = s;
232 r[2] = s;
233 r[3] = s;
234 }
235
236 static SIMDINLINE
237 Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1)
238 {
239 Float tmp, r;
240 r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
241
242 tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
243 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
244
245 tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
246 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
247
248 return r;
249 }
250
251 static SIMDINLINE
252 Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1)
253 {
254 Float tmp, r;
255 r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
256
257 tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
258 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
259
260 tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
261 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
262
263 tmp = SIMD::mul_ps(v0[3], v1[3]); // (v0.w*v1.w)
264 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
265
266 return r;
267 }
268
269 static SIMDINLINE
270 Float vec4_rcp_length_ps(const Vec4& v)
271 {
272 Float length = vec4_dp4_ps(v, v);
273 return SIMD::rsqrt_ps(length);
274 }
275
276 static SIMDINLINE
277 void vec4_normalize_ps(Vec4& r, const Vec4& v)
278 {
279 Float rcpLength = vec4_rcp_length_ps(v);
280
281 r[0] = SIMD::mul_ps(v[0], rcpLength);
282 r[1] = SIMD::mul_ps(v[1], rcpLength);
283 r[2] = SIMD::mul_ps(v[2], rcpLength);
284 r[3] = SIMD::mul_ps(v[3], rcpLength);
285 }
286
287 static SIMDINLINE
288 void vec4_mul_ps(Vec4& r, const Vec4& v, Float const &s)
289 {
290 r[0] = SIMD::mul_ps(v[0], s);
291 r[1] = SIMD::mul_ps(v[1], s);
292 r[2] = SIMD::mul_ps(v[2], s);
293 r[3] = SIMD::mul_ps(v[3], s);
294 }
295
296 static SIMDINLINE
297 void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
298 {
299 r[0] = SIMD::mul_ps(v0[0], v1[0]);
300 r[1] = SIMD::mul_ps(v0[1], v1[1]);
301 r[2] = SIMD::mul_ps(v0[2], v1[2]);
302 r[3] = SIMD::mul_ps(v0[3], v1[3]);
303 }
304
305 static SIMDINLINE
306 void vec4_add_ps(Vec4& r, const Vec4& v0, Float const &s)
307 {
308 r[0] = SIMD::add_ps(v0[0], s);
309 r[1] = SIMD::add_ps(v0[1], s);
310 r[2] = SIMD::add_ps(v0[2], s);
311 r[3] = SIMD::add_ps(v0[3], s);
312 }
313
314 static SIMDINLINE
315 void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
316 {
317 r[0] = SIMD::add_ps(v0[0], v1[0]);
318 r[1] = SIMD::add_ps(v0[1], v1[1]);
319 r[2] = SIMD::add_ps(v0[2], v1[2]);
320 r[3] = SIMD::add_ps(v0[3], v1[3]);
321 }
322
323 static SIMDINLINE
324 void vec4_min_ps(Vec4& r, const Vec4& v0, Float const &s)
325 {
326 r[0] = SIMD::min_ps(v0[0], s);
327 r[1] = SIMD::min_ps(v0[1], s);
328 r[2] = SIMD::min_ps(v0[2], s);
329 r[3] = SIMD::min_ps(v0[3], s);
330 }
331
332 static SIMDINLINE
333 void vec4_max_ps(Vec4& r, const Vec4& v0, Float const &s)
334 {
335 r[0] = SIMD::max_ps(v0[0], s);
336 r[1] = SIMD::max_ps(v0[1], s);
337 r[2] = SIMD::max_ps(v0[2], s);
338 r[3] = SIMD::max_ps(v0[3], s);
339 }
340
341 // Matrix4x4 * Vector4
342 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
343 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
344 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
345 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
346 static SIMDINLINE
347 void SIMDCALL mat4x4_vec4_multiply(
348 Vec4& result,
349 const float *pMatrix,
350 const Vec4& v)
351 {
352 Float m;
353 Float r0;
354 Float r1;
355
356 m = SIMD::load1_ps(pMatrix + 0*4 + 0); // m[row][0]
357 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
358 m = SIMD::load1_ps(pMatrix + 0*4 + 1); // m[row][1]
359 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
360 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
361 m = SIMD::load1_ps(pMatrix + 0*4 + 2); // m[row][2]
362 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
363 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
364 m = SIMD::load1_ps(pMatrix + 0*4 + 3); // m[row][3]
365 r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z)
366 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
367 result[0] = r0;
368
369 m = SIMD::load1_ps(pMatrix + 1*4 + 0); // m[row][0]
370 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
371 m = SIMD::load1_ps(pMatrix + 1*4 + 1); // m[row][1]
372 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
373 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
374 m = SIMD::load1_ps(pMatrix + 1*4 + 2); // m[row][2]
375 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
376 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
377 m = SIMD::load1_ps(pMatrix + 1*4 + 3); // m[row][3]
378 r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z)
379 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
380 result[1] = r0;
381
382 m = SIMD::load1_ps(pMatrix + 2*4 + 0); // m[row][0]
383 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
384 m = SIMD::load1_ps(pMatrix + 2*4 + 1); // m[row][1]
385 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
386 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
387 m = SIMD::load1_ps(pMatrix + 2*4 + 2); // m[row][2]
388 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
389 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
390 m = SIMD::load1_ps(pMatrix + 2*4 + 3); // m[row][3]
391 r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z)
392 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
393 result[2] = r0;
394
395 m = SIMD::load1_ps(pMatrix + 3*4 + 0); // m[row][0]
396 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
397 m = SIMD::load1_ps(pMatrix + 3*4 + 1); // m[row][1]
398 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
399 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
400 m = SIMD::load1_ps(pMatrix + 3*4 + 2); // m[row][2]
401 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
402 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
403 m = SIMD::load1_ps(pMatrix + 3*4 + 3); // m[row][3]
404 r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z)
405 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
406 result[3] = r0;
407 }
408
409 // Matrix4x4 * Vector3 - Direction Vector where w = 0.
410 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
411 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
412 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
413 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
414 static SIMDINLINE
415 void SIMDCALL mat3x3_vec3_w0_multiply(
416 Vec4& result,
417 const float *pMatrix,
418 const Vec4& v)
419 {
420 Float m;
421 Float r0;
422 Float r1;
423
424 m = SIMD::load1_ps(pMatrix + 0*4 + 0); // m[row][0]
425 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
426 m = SIMD::load1_ps(pMatrix + 0*4 + 1); // m[row][1]
427 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
428 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
429 m = SIMD::load1_ps(pMatrix + 0*4 + 2); // m[row][2]
430 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
431 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
432 result[0] = r0;
433
434 m = SIMD::load1_ps(pMatrix + 1*4 + 0); // m[row][0]
435 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
436 m = SIMD::load1_ps(pMatrix + 1*4 + 1); // m[row][1]
437 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
438 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
439 m = SIMD::load1_ps(pMatrix + 1*4 + 2); // m[row][2]
440 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
441 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
442 result[1] = r0;
443
444 m = SIMD::load1_ps(pMatrix + 2*4 + 0); // m[row][0]
445 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
446 m = SIMD::load1_ps(pMatrix + 2*4 + 1); // m[row][1]
447 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
448 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
449 m = SIMD::load1_ps(pMatrix + 2*4 + 2); // m[row][2]
450 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
451 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
452 result[2] = r0;
453
454 result[3] = SIMD::setzero_ps();
455 }
456
457 // Matrix4x4 * Vector3 - Position vector where w = 1.
458 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
459 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
460 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
461 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
462 static SIMDINLINE
463 void SIMDCALL mat4x4_vec3_w1_multiply(
464 Vec4& result,
465 const float *pMatrix,
466 const Vec4& v)
467 {
468 Float m;
469 Float r0;
470 Float r1;
471
472 m = SIMD::load1_ps(pMatrix + 0*4 + 0); // m[row][0]
473 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
474 m = SIMD::load1_ps(pMatrix + 0*4 + 1); // m[row][1]
475 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
476 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
477 m = SIMD::load1_ps(pMatrix + 0*4 + 2); // m[row][2]
478 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
479 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
480 m = SIMD::load1_ps(pMatrix + 0*4 + 3); // m[row][3]
481 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
482 result[0] = r0;
483
484 m = SIMD::load1_ps(pMatrix + 1*4 + 0); // m[row][0]
485 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
486 m = SIMD::load1_ps(pMatrix + 1*4 + 1); // m[row][1]
487 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
488 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
489 m = SIMD::load1_ps(pMatrix + 1*4 + 2); // m[row][2]
490 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
491 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
492 m = SIMD::load1_ps(pMatrix + 1*4 + 3); // m[row][3]
493 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
494 result[1] = r0;
495
496 m = SIMD::load1_ps(pMatrix + 2*4 + 0); // m[row][0]
497 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
498 m = SIMD::load1_ps(pMatrix + 2*4 + 1); // m[row][1]
499 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
500 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
501 m = SIMD::load1_ps(pMatrix + 2*4 + 2); // m[row][2]
502 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
503 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
504 m = SIMD::load1_ps(pMatrix + 2*4 + 3); // m[row][3]
505 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
506 result[2] = r0;
507
508 m = SIMD::load1_ps(pMatrix + 3*4 + 0); // m[row][0]
509 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
510 m = SIMD::load1_ps(pMatrix + 3*4 + 1); // m[row][1]
511 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
512 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
513 m = SIMD::load1_ps(pMatrix + 3*4 + 2); // m[row][2]
514 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
515 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
516 m = SIMD::load1_ps(pMatrix + 3*4 + 3); // m[row][3]
517 result[3] = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
518 }
519
520 static SIMDINLINE
521 void SIMDCALL mat4x3_vec3_w1_multiply(
522 Vec4& result,
523 const float *pMatrix,
524 const Vec4& v)
525 {
526 Float m;
527 Float r0;
528 Float r1;
529
530 m = SIMD::load1_ps(pMatrix + 0*4 + 0); // m[row][0]
531 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
532 m = SIMD::load1_ps(pMatrix + 0*4 + 1); // m[row][1]
533 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
534 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
535 m = SIMD::load1_ps(pMatrix + 0*4 + 2); // m[row][2]
536 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
537 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
538 m = SIMD::load1_ps(pMatrix + 0*4 + 3); // m[row][3]
539 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
540 result[0] = r0;
541
542 m = SIMD::load1_ps(pMatrix + 1*4 + 0); // m[row][0]
543 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
544 m = SIMD::load1_ps(pMatrix + 1*4 + 1); // m[row][1]
545 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
546 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
547 m = SIMD::load1_ps(pMatrix + 1*4 + 2); // m[row][2]
548 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
549 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
550 m = SIMD::load1_ps(pMatrix + 1*4 + 3); // m[row][3]
551 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
552 result[1] = r0;
553
554 m = SIMD::load1_ps(pMatrix + 2*4 + 0); // m[row][0]
555 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
556 m = SIMD::load1_ps(pMatrix + 2*4 + 1); // m[row][1]
557 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
558 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
559 m = SIMD::load1_ps(pMatrix + 2*4 + 2); // m[row][2]
560 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
561 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
562 m = SIMD::load1_ps(pMatrix + 2*4 + 3); // m[row][3]
563 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
564 result[2] = r0;
565 result[3] = SIMD::set1_ps(1.0f);
566 }
567 }; // struct SIMDBase
568
569 using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
570 using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>;
571 using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>;