swr/rast: simdlib cleanup, clipper stack space fixes
mesa.git: src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp
/****************************************************************************
 * Copyright (C) 2017 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 ****************************************************************************/
#pragma once
#if 0
//===========================================================================
// Placeholder name representing any of the SIMD4, SIMD256, or SIMD16 structures.
//===========================================================================
struct SIMD256 // or SIMD4 or SIMD16
{
    //=======================================================================
    // SIMD Types
    //
    // These typedefs are examples; each width (SIMD4, SIMD256, SIMD16) maps
    // these same names onto its own underlying base types.
    using Float   = __m256;  // Packed single-precision float vector
    using Double  = __m256d; // Packed double-precision float vector
    using Integer = __m256i; // Packed integer vector (mutable element widths)
    using Mask    = uint8_t; // Integer representing mask bits

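    // Example (illustrative sketch, not part of the interface): because every
    // width exposes the same type names and member functions, helpers can be
    // written once as templates over the SIMD structure. "SIMD_T" and "MulAdd"
    // are hypothetical names used only for this sketch.
    //
    //     template <typename SIMD_T>
    //     typename SIMD_T::Float MulAdd(typename SIMD_T::Float x,
    //                                   typename SIMD_T::Float scale,
    //                                   typename SIMD_T::Float bias)
    //     {
    //         // Same source compiles for 4-, 8-, or 16-wide vectors.
    //         return SIMD_T::add_ps(SIMD_T::mul_ps(x, scale), bias);
    //     }
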
    //=======================================================================
    // Standard interface
    // (available in both SIMD256 and SIMD16 widths)
    //=======================================================================

    //-----------------------------------------------------------------------
    // Single precision floating point arithmetic operations
    //-----------------------------------------------------------------------
    static Float add_ps(Float a, Float b);             // return a + b
    static Float div_ps(Float a, Float b);             // return a / b
    static Float fmadd_ps(Float a, Float b, Float c);  // return (a * b) + c
    static Float fmsub_ps(Float a, Float b, Float c);  // return (a * b) - c
    static Float max_ps(Float a, Float b);             // return (a > b) ? a : b
    static Float min_ps(Float a, Float b);             // return (a < b) ? a : b
    static Float mul_ps(Float a, Float b);             // return a * b
    static Float rcp_ps(Float a);                      // return 1.0f / a
    static Float rsqrt_ps(Float a);                    // return 1.0f / sqrt(a)
    static Float sub_ps(Float a, Float b);              // return a - b

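    // Example (illustrative sketch, not part of the interface): a per-lane
    // linear interpolation built from the operations above. The helper name
    // "Lerp" is hypothetical; SIMD256 is the placeholder structure name from
    // this file.
    //
    //     inline SIMD256::Float Lerp(SIMD256::Float a, SIMD256::Float b, SIMD256::Float t)
    //     {
    //         // a + t * (b - a), with the final multiply-add fused into one operation
    //         return SIMD256::fmadd_ps(t, SIMD256::sub_ps(b, a), a);
    //     }
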
    enum class RoundMode
    {
        TO_NEAREST_INT = 0x00, // Round to nearest integer (ties round to even)
        TO_NEG_INF     = 0x01, // Round to negative infinity
        TO_POS_INF     = 0x02, // Round to positive infinity
        TO_ZERO        = 0x03, // Round toward 0, a.k.a. truncate
        CUR_DIRECTION  = 0x04, // Round in the direction set in the MXCSR register

        RAISE_EXC      = 0x00, // Do not suppress precision (inexact) exceptions
        NO_EXC         = 0x08, // Suppress precision exceptions

        NINT        = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC),
        NINT_NOEXC  = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC),
        FLOOR       = static_cast<int>(TO_NEG_INF)     | static_cast<int>(RAISE_EXC),
        FLOOR_NOEXC = static_cast<int>(TO_NEG_INF)     | static_cast<int>(NO_EXC),
        CEIL        = static_cast<int>(TO_POS_INF)     | static_cast<int>(RAISE_EXC),
        CEIL_NOEXC  = static_cast<int>(TO_POS_INF)     | static_cast<int>(NO_EXC),
        TRUNC       = static_cast<int>(TO_ZERO)        | static_cast<int>(RAISE_EXC),
        TRUNC_NOEXC = static_cast<int>(TO_ZERO)        | static_cast<int>(NO_EXC),
        RINT        = static_cast<int>(CUR_DIRECTION)  | static_cast<int>(RAISE_EXC),
        NEARBYINT   = static_cast<int>(CUR_DIRECTION)  | static_cast<int>(NO_EXC),
    };

    // return round_func(a)
    //
    // round_func is chosen based on the RMT template parameter. See the
    // documentation for the RoundMode enumeration above.
    template <RoundMode RMT>
    static Float round_ps(Float a); // return round(a)

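    // Example (illustrative sketch, not part of the interface): per-lane
    // fractional part built on round_ps. "Frac" is a hypothetical helper name.
    //
    //     inline SIMD256::Float Frac(SIMD256::Float x)
    //     {
    //         // x - floor(x); FLOOR rounds toward negative infinity
    //         SIMD256::Float flr = SIMD256::round_ps<SIMD256::RoundMode::FLOOR>(x);
    //         return SIMD256::sub_ps(x, flr);
    //     }
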
    //-----------------------------------------------------------------------
    // Integer (various width) arithmetic operations
    //-----------------------------------------------------------------------
    static Integer abs_epi32(Integer a);             // return absolute_value(a) (int32)
    static Integer add_epi32(Integer a, Integer b);  // return a + b (int32)
    static Integer add_epi8(Integer a, Integer b);   // return a + b (int8)
    static Integer adds_epu8(Integer a, Integer b);  // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
    static Integer max_epi32(Integer a, Integer b);  // return (a > b) ? a : b (int32)
    static Integer max_epu32(Integer a, Integer b);  // return (a > b) ? a : b (uint32)
    static Integer min_epi32(Integer a, Integer b);  // return (a < b) ? a : b (int32)
    static Integer min_epu32(Integer a, Integer b);  // return (a < b) ? a : b (uint32)
    static Integer mul_epi32(Integer a, Integer b);  // return a * b (int32)

    // return (a * b) & 0xFFFFFFFF
    //
    // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit
    // integers, and store the low 32 bits of the intermediate integers in dst.
    static Integer mullo_epi32(Integer a, Integer b);

    static Integer sub_epi32(Integer a, Integer b);  // return a - b (int32)
    static Integer sub_epi64(Integer a, Integer b);  // return a - b (int64)
    static Integer subs_epu8(Integer a, Integer b);  // return (b > a) ? 0 : (a - b) (uint8)

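    // Example (illustrative sketch, not part of the interface): a 32-bit
    // integer multiply-accumulate that keeps only the low 32 bits of each
    // product. "MulAdd32" is a hypothetical helper name.
    //
    //     inline SIMD256::Integer MulAdd32(SIMD256::Integer a, SIMD256::Integer b, SIMD256::Integer c)
    //     {
    //         // (a * b) + c per 32-bit lane, product truncated to 32 bits
    //         return SIMD256::add_epi32(SIMD256::mullo_epi32(a, b), c);
    //     }
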
    //-----------------------------------------------------------------------
    // Logical operations
    //-----------------------------------------------------------------------
    static Float   and_ps(Float a, Float b);         // return a & b (float treated as int)
    static Integer and_si(Integer a, Integer b);     // return a & b (int)
    static Float   andnot_ps(Float a, Float b);      // return (~a) & b (float treated as int)
    static Integer andnot_si(Integer a, Integer b);  // return (~a) & b (int)
    static Float   or_ps(Float a, Float b);          // return a | b (float treated as int)
    static Integer or_si(Integer a, Integer b);      // return a | b (int)
    static Float   xor_ps(Float a, Float b);         // return a ^ b (float treated as int)
    static Integer xor_si(Integer a, Integer b);     // return a ^ b (int)

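    // Example (illustrative sketch, not part of the interface): a branchless
    // per-lane select built from the logical operations, for masks produced by
    // the comparison operations below (all-ones or all-zeros per lane).
    // "SelectPS" is a hypothetical helper name.
    //
    //     inline SIMD256::Float SelectPS(SIMD256::Float mask, SIMD256::Float ifSet, SIMD256::Float ifClear)
    //     {
    //         // (mask & ifSet) | (~mask & ifClear)
    //         return SIMD256::or_ps(SIMD256::and_ps(mask, ifSet),
    //                               SIMD256::andnot_ps(mask, ifClear));
    //     }
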
    //-----------------------------------------------------------------------
    // Shift operations
    //-----------------------------------------------------------------------
    template <int ImmT>
    static Integer slli_epi32(Integer a);            // return a << ImmT
    static Integer sllv_epi32(Integer a, Integer b); // return a << b
    template <int ImmT>
    static Integer srai_epi32(Integer a);            // return a >> ImmT (int32)
    template <int ImmT>
    static Integer srli_epi32(Integer a);            // return a >> ImmT (uint32)
    template <int ImmT>                              // for each 128-bit lane:
    static Integer srli_si(Integer a);               //     return a >> (ImmT * 8) (uint)
    template <int ImmT>
    static Float srlisi_ps(Float a);                 // same as srli_si, but with Float cast to int
    static Integer srlv_epi32(Integer a, Integer b); // return a >> b (uint32)

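    // Example (illustrative sketch, not part of the interface): extracting an
    // 8-bit field from each 32-bit lane with an immediate shift and a mask.
    // "ExtractByte2" is a hypothetical helper name.
    //
    //     inline SIMD256::Integer ExtractByte2(SIMD256::Integer packed)
    //     {
    //         // (packed >> 16) & 0xFF per 32-bit lane
    //         return SIMD256::and_si(SIMD256::srli_epi32<16>(packed),
    //                                SIMD256::set1_epi32(0xFF));
    //     }
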
    //-----------------------------------------------------------------------
    // Conversion operations
    //-----------------------------------------------------------------------
    static Float   castpd_ps(Double a);       // return *(Float*)(&a)
    static Integer castps_si(Float a);        // return *(Integer*)(&a)
    static Double  castsi_pd(Integer a);      // return *(Double*)(&a)
    static Double  castps_pd(Float a);        // return *(Double*)(&a)
    static Float   castsi_ps(Integer a);      // return *(Float*)(&a)
    static Float   cvtepi32_ps(Integer a);    // return (float)a  (int32 --> float)
    static Integer cvtepu8_epi16(Integer a);  // return (int16)a  (uint8 --> int16)
    static Integer cvtepu8_epi32(Integer a);  // return (int32)a  (uint8 --> int32)
    static Integer cvtepu16_epi32(Integer a); // return (int32)a  (uint16 --> int32)
    static Integer cvtepu16_epi64(Integer a); // return (int64)a  (uint16 --> int64)
    static Integer cvtepu32_epi64(Integer a); // return (int64)a  (uint32 --> int64)
    static Integer cvtps_epi32(Float a);      // return (int32)a  (float --> int32)
    static Integer cvttps_epi32(Float a);     // return (int32)a  (rnd_to_zero(float) --> int32)

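    // Example (illustrative sketch, not part of the interface): quantizing
    // normalized floats to 8-bit integers. cvtps_epi32 rounds using the
    // current rounding mode, while cvttps_epi32 truncates toward zero.
    // "QuantizeUnorm8" is a hypothetical helper name.
    //
    //     inline SIMD256::Integer QuantizeUnorm8(SIMD256::Float x01)
    //     {
    //         // round(x * 255.0f) per lane
    //         SIMD256::Float scaled = SIMD256::mul_ps(x01, SIMD256::set1_ps(255.0f));
    //         return SIMD256::cvtps_epi32(scaled);
    //     }
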
    //-----------------------------------------------------------------------
    // Comparison operations
    //-----------------------------------------------------------------------

    // Comparison types used with cmp_ps:
    //  - ordered comparisons are always false if either operand is NaN
    //  - unordered comparisons are always true if either operand is NaN
    //  - signaling comparisons raise an exception if either operand is NaN
    //  - non-signaling comparisons never raise an exception
    //
    // Ordered:   return !isnan(a) && !isnan(b) && (a cmp b)
    // Unordered: return  isnan(a) ||  isnan(b) || (a cmp b)
    enum class CompareType
    {
        EQ_OQ    = 0x00, // Equal (ordered, non-signaling)
        LT_OS    = 0x01, // Less-than (ordered, signaling)
        LE_OS    = 0x02, // Less-than-or-equal (ordered, signaling)
        UNORD_Q  = 0x03, // Unordered (non-signaling)
        NEQ_UQ   = 0x04, // Not-equal (unordered, non-signaling)
        NLT_US   = 0x05, // Not-less-than (unordered, signaling)
        NLE_US   = 0x06, // Not-less-than-or-equal (unordered, signaling)
        ORD_Q    = 0x07, // Ordered (non-signaling)
        EQ_UQ    = 0x08, // Equal (unordered, non-signaling)
        NGE_US   = 0x09, // Not-greater-than-or-equal (unordered, signaling)
        NGT_US   = 0x0A, // Not-greater-than (unordered, signaling)
        FALSE_OQ = 0x0B, // False (ordered, non-signaling)
        NEQ_OQ   = 0x0C, // Not-equal (ordered, non-signaling)
        GE_OS    = 0x0D, // Greater-than-or-equal (ordered, signaling)
        GT_OS    = 0x0E, // Greater-than (ordered, signaling)
        TRUE_UQ  = 0x0F, // True (unordered, non-signaling)
        EQ_OS    = 0x10, // Equal (ordered, signaling)
        LT_OQ    = 0x11, // Less-than (ordered, non-signaling)
        LE_OQ    = 0x12, // Less-than-or-equal (ordered, non-signaling)
        UNORD_S  = 0x13, // Unordered (signaling)
        NEQ_US   = 0x14, // Not-equal (unordered, signaling)
        NLT_UQ   = 0x15, // Not-less-than (unordered, non-signaling)
        NLE_UQ   = 0x16, // Not-less-than-or-equal (unordered, non-signaling)
        ORD_S    = 0x17, // Ordered (signaling)
        EQ_US    = 0x18, // Equal (unordered, signaling)
        NGE_UQ   = 0x19, // Not-greater-than-or-equal (unordered, non-signaling)
        NGT_UQ   = 0x1A, // Not-greater-than (unordered, non-signaling)
        FALSE_OS = 0x1B, // False (ordered, signaling)
        NEQ_OS   = 0x1C, // Not-equal (ordered, signaling)
        GE_OQ    = 0x1D, // Greater-than-or-equal (ordered, non-signaling)
        GT_OQ    = 0x1E, // Greater-than (ordered, non-signaling)
        TRUE_US  = 0x1F, // True (unordered, signaling)
    };

    // return a (CmpTypeT) b (float)
    //
    // See the documentation for CompareType above for valid values of CmpTypeT.
    template <CompareType CmpTypeT>
    static Float cmp_ps(Float a, Float b);            // return a (CmpTypeT) b (see above)
    static Float cmpgt_ps(Float a, Float b);          // return cmp_ps<CompareType::GT_OQ>(a, b)
    static Float cmple_ps(Float a, Float b);          // return cmp_ps<CompareType::LE_OQ>(a, b)
    static Float cmplt_ps(Float a, Float b);          // return cmp_ps<CompareType::LT_OQ>(a, b)
    static Float cmpneq_ps(Float a, Float b);         // return cmp_ps<CompareType::NEQ_OQ>(a, b)
    static Float cmpeq_ps(Float a, Float b);          // return cmp_ps<CompareType::EQ_OQ>(a, b)
    static Float cmpge_ps(Float a, Float b);          // return cmp_ps<CompareType::GE_OQ>(a, b)
    static Integer cmpeq_epi8(Integer a, Integer b);  // return a == b (int8)
    static Integer cmpeq_epi16(Integer a, Integer b); // return a == b (int16)
    static Integer cmpeq_epi32(Integer a, Integer b); // return a == b (int32)
    static Integer cmpeq_epi64(Integer a, Integer b); // return a == b (int64)
    static Integer cmpgt_epi8(Integer a, Integer b);  // return a > b (int8)
    static Integer cmpgt_epi16(Integer a, Integer b); // return a > b (int16)
    static Integer cmpgt_epi32(Integer a, Integer b); // return a > b (int32)
    static Integer cmpgt_epi64(Integer a, Integer b); // return a > b (int64)
    static Integer cmplt_epi32(Integer a, Integer b); // return a < b (int32)
    static bool testz_ps(Float a, Float b);           // return all_lanes_zero(a & b) ? 1 : 0 (float)
    static bool testz_si(Integer a, Integer b);       // return all_lanes_zero(a & b) ? 1 : 0 (int)

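    // Example (illustrative sketch, not part of the interface): using a
    // comparison mask with testz_ps to skip work when no lane qualifies.
    // "AnyLaneNegative" is a hypothetical helper name.
    //
    //     inline bool AnyLaneNegative(SIMD256::Float x)
    //     {
    //         SIMD256::Float neg = SIMD256::cmplt_ps(x, SIMD256::setzero_ps());
    //         // testz_ps returns true only when (neg & neg) is zero in every lane
    //         return !SIMD256::testz_ps(neg, neg);
    //     }
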
    //-----------------------------------------------------------------------
    // Blend / shuffle / permute operations
    //-----------------------------------------------------------------------
    template <int ImmT>
    static Float blend_ps(Float a, Float b);                       // return (ImmT & (1 << i)) ? b[i] : a[i] (float)
    static Integer blendv_epi32(Integer a, Integer b, Float mask); // return mask[i] ? b[i] : a[i] (int)
    static Float blendv_ps(Float a, Float b, Float mask);          // return mask[i] ? b[i] : a[i] (float)
    static Float broadcast_ss(float const* p);                     // return *p (all elements in vector get same value)
    static Integer packs_epi16(Integer a, Integer b);  // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
    static Integer packs_epi32(Integer a, Integer b);  // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
    static Integer packus_epi16(Integer a, Integer b); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
    static Integer packus_epi32(Integer a, Integer b); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
    static Integer permute_epi32(Integer a, Integer swiz); // return a[swiz[i]] for each 32-bit lane i (int32)
    static Float permute_ps(Float a, Integer swiz);         // return a[swiz[i]] for each 32-bit lane i (float)
    template <int SwizT>
    static Integer shuffle_epi32(Integer a, Integer b);
    template <int SwizT>
    static Integer shuffle_epi64(Integer a, Integer b);
    static Integer shuffle_epi8(Integer a, Integer b);
    template <int SwizT>
    static Double shuffle_pd(Double a, Double b);
    template <int SwizT>
    static Float shuffle_ps(Float a, Float b);
    static Integer unpackhi_epi16(Integer a, Integer b);
    static Integer unpackhi_epi32(Integer a, Integer b);
    static Integer unpackhi_epi64(Integer a, Integer b);
    static Integer unpackhi_epi8(Integer a, Integer b);
    static Double unpackhi_pd(Double a, Double b);
    static Float unpackhi_ps(Float a, Float b);
    static Integer unpacklo_epi16(Integer a, Integer b);
    static Integer unpacklo_epi32(Integer a, Integer b);
    static Integer unpacklo_epi64(Integer a, Integer b);
    static Integer unpacklo_epi8(Integer a, Integer b);
    static Double unpacklo_pd(Double a, Double b);
    static Float unpacklo_ps(Float a, Float b);

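    // Example (illustrative sketch, not part of the interface): blendv_ps as a
    // per-lane select driven by a comparison mask, here raising lanes below a
    // floor value. "ClampBelow" is a hypothetical helper name; the same result
    // could also be obtained with max_ps.
    //
    //     inline SIMD256::Float ClampBelow(SIMD256::Float x, SIMD256::Float floorVal)
    //     {
    //         SIMD256::Float tooSmall = SIMD256::cmplt_ps(x, floorVal);
    //         // where tooSmall is set, take floorVal; elsewhere keep x
    //         return SIMD256::blendv_ps(x, floorVal, tooSmall);
    //     }
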
    //-----------------------------------------------------------------------
    // Load / store operations
    //-----------------------------------------------------------------------
    enum class ScaleFactor
    {
        SF_1, // No scaling
        SF_2, // Scale offset by 2
        SF_4, // Scale offset by 4
        SF_8, // Scale offset by 8
    };

    template <ScaleFactor ScaleT = ScaleFactor::SF_1>
    static Float i32gather_ps(float const* p, Integer idx); // return *(float*)(((int8*)p) + (idx * ScaleT))
    static Float load1_ps(float const* p);     // return *p (broadcast 1 value to all elements)
    static Float load_ps(float const* p);      // return *p (loads SIMD width elements from memory)
    static Integer load_si(Integer const* p);  // return *p
    static Float loadu_ps(float const* p);     // return *p (same as load_ps but allows unaligned memory)
    static Integer loadu_si(Integer const* p); // return *p (same as load_si but allows unaligned memory)

    // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
    template <ScaleFactor ScaleT>
    static Float mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask);

    static void maskstore_ps(float* p, Integer mask, Float src);
    static int movemask_epi8(Integer a);
    static int movemask_pd(Double a);
    static int movemask_ps(Float a);
    static Integer set1_epi32(int i); // return i (all elements are same value)
    static Integer set1_epi8(char i); // return i (all elements are same value)
    static Float set1_ps(float f);    // return f (all elements are same value)
    static Float setzero_ps();        // return 0 (float)
    static Integer setzero_si();      // return 0 (integer)
    static void store_ps(float* p, Float a);     // *p = a (stores all elements contiguously in memory)
    static void store_si(Integer* p, Integer a); // *p = a
    static void stream_ps(float* p, Float a);    // *p = a (same as store_ps, but uses a non-temporal hint so the data is not kept in cache)

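    // Example (illustrative sketch, not part of the interface): gathering one
    // float per lane from an array of 4-byte elements, scaling each index by
    // sizeof(float) with SF_4. "GatherFloats" and "pBase" are hypothetical names.
    //
    //     inline SIMD256::Float GatherFloats(const float* pBase, SIMD256::Integer idx)
    //     {
    //         // loads pBase[idx[i]] into lane i
    //         return SIMD256::i32gather_ps<SIMD256::ScaleFactor::SF_4>(pBase, idx);
    //     }
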
    //=======================================================================
    // Legacy interface (available only in SIMD256 width)
    //=======================================================================

    static Float broadcast_ps(__m128 const* p);
    template <int ImmT>
    static __m128d extractf128_pd(Double a);
    template <int ImmT>
    static __m128 extractf128_ps(Float a);
    template <int ImmT>
    static __m128i extractf128_si(Integer a);
    template <int ImmT>
    static Double insertf128_pd(Double a, __m128d b);
    template <int ImmT>
    static Float insertf128_ps(Float a, __m128 b);
    template <int ImmT>
    static Integer insertf128_si(Integer a, __m128i b);
    static Integer loadu2_si(__m128i const* phi, __m128i const* plo);
    template <int ImmT>
    static Double permute2f128_pd(Double a, Double b);
    template <int ImmT>
    static Float permute2f128_ps(Float a, Float b);
    template <int ImmT>
    static Integer permute2f128_si(Integer a, Integer b);
    static Integer set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0);
    static void storeu2_si(__m128i* phi, __m128i* plo, Integer src);

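    // Example (illustrative sketch, not part of the interface): the legacy
    // helpers operate on 128-bit halves of a 256-bit register, e.g. swapping
    // the low and high halves. "SwapHalves" is a hypothetical helper name;
    // permute2f128_ps<0x01>(a, a) would produce the same result in one step.
    //
    //     inline SIMD256::Float SwapHalves(SIMD256::Float a)
    //     {
    //         __m128 lo = SIMD256::extractf128_ps<0>(a);
    //         __m128 hi = SIMD256::extractf128_ps<1>(a);
    //         // reinsert the halves in exchanged positions
    //         SIMD256::Float r = SIMD256::insertf128_ps<0>(a, hi);
    //         return SIMD256::insertf128_ps<1>(r, lo);
    //     }
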
    //=======================================================================
    // Advanced masking interface (currently available only in SIMD16 width)
    //=======================================================================

    //=======================================================================
    // Extended Utility Functions (common to SIMD256 and SIMD16)
    //=======================================================================

    //-----------------------------------------------------------------------
    // Extended Types
    //-----------------------------------------------------------------------

    // Vec4, an SOA SIMD set of 4-dimensional vectors
    union Vec4
    {
        Vec4() = default;
        Vec4(Float in)
        {
            s.x = in;
            s.y = in;
            s.z = in;
            s.w = in;
        }
        Vec4(Float x, Float y, Float z, Float w)
        {
            s.x = x;
            s.y = y;
            s.z = z;
            s.w = w;
        }

        Float   v[4];
        Integer vi[4];
        struct
        {
            Float x;
            Float y;
            Float z;
            Float w;
        } s;
        Float&       operator[](const int i)       { return v[i]; }
        Float const& operator[](const int i) const { return v[i]; }
    };

    //-----------------------------------------------------------------------
    // Extended Functions
    //-----------------------------------------------------------------------
    static void vec4_set1_ps(Vec4& r, const float* p);                // r[0] = set1(p[0]), r[1] = set1(p[1]), ...
    static void vec4_set1_vps(Vec4& r, Float s);                      // r[0] = s, r[1] = s, ...
    static Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1);         // return dp3(v0, v1)
    static Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1);         // return dp4(v0, v1)
    static Float vec4_rcp_length_ps(const Vec4& v);                   // return 1.0f / sqrt(dp4(v, v))
    static void vec4_normalize_ps(Vec4& r, const Vec4& v);            // r = v * rcp_length(v)
    static void vec4_mul_ps(Vec4& r, const Vec4& v, Float s);         // r = v * set1_vps(s)
    static void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1); // r = v0 * v1
    static void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1); // r = v0 + v1
    static void vec4_min_ps(Vec4& r, const Vec4& v0, Float s);        // r = (v0 < s) ? v0 : s
    static void vec4_max_ps(Vec4& r, const Vec4& v0, Float s);        // r = (v0 > s) ? v0 : s

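    // Example (illustrative sketch, not part of the interface): the Vec4
    // helpers treat each component as one SIMD register, so a whole packet of
    // N.L lighting terms can be computed at once. "DiffuseTerm" is a
    // hypothetical helper name.
    //
    //     inline SIMD256::Float DiffuseTerm(const SIMD256::Vec4& normal,
    //                                       const SIMD256::Vec4& lightDir)
    //     {
    //         // dot(normal.xyz, lightDir.xyz) per lane, clamped at zero
    //         SIMD256::Float nDotL = SIMD256::vec4_dp3_ps(normal, lightDir);
    //         return SIMD256::max_ps(nDotL, SIMD256::setzero_ps());
    //     }
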
    // Matrix4x4 * Vector4
    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * v.s.w)
    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * v.s.w)
    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * v.s.w)
    //   result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * v.s.w)
    static void mat4x4_vec4_multiply(
        Vec4& result,
        const float* pMatrix,
        const Vec4& v);

    // Matrix4x4 * Vector3 - Direction vector where w = 0.
    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 0)
    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 0)
    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 0)
    //   result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 0)
    static void mat3x3_vec3_w0_multiply(
        Vec4& result,
        const float* pMatrix,
        const Vec4& v);

    // Matrix4x4 * Vector3 - Position vector where w = 1.
    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
    //   result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 1)
    static void mat4x4_vec3_w1_multiply(
        Vec4& result,
        const float* pMatrix,
        const Vec4& v);

    // Matrix4x3 * Vector3 - Position vector where w = 1.
    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
    //   result.s.w = 1
    static void mat4x3_vec3_w1_multiply(
        Vec4& result,
        const float* pMatrix,
        const Vec4& v);
};
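
// Example usage (illustrative sketch, not part of the interface): transforming
// a packet of SOA object-space positions to clip space with the matrix helpers
// above. "TransformToClip" and "pWorldViewProj" are hypothetical names; the
// matrix layout is whatever mat4x4_vec3_w1_multiply expects.
//
//     inline void TransformToClip(SIMD256::Vec4&       clipPos,
//                                 const float*         pWorldViewProj,
//                                 const SIMD256::Vec4& objPos)
//     {
//         // Treats each position as (x, y, z, 1) and writes all four
//         // clip-space components, one SIMD register per component.
//         SIMD256::mat4x4_vec3_w1_multiply(clipPos, pWorldViewProj, objPos);
//     }
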
#endif // #if 0