os/graphics/m3g/m3gcore11/src/m3g_math.c
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2 * Copyright (c) 2003 Nokia Corporation and/or its subsidiary(-ies).
     3 * All rights reserved.
     4 * This component and the accompanying materials are made available
     5 * under the terms of the License "Eclipse Public License v1.0"
     6 * which accompanies this distribution, and is available
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
     8 *
     9 * Initial Contributors:
    10 * Nokia Corporation - initial contribution.
    11 *
    12 * Contributors:
    13 *
    14 * Description: Internal math function implementations
    15 *
    16 */
    17 
    18 
    19 /*!
    20  * \internal
    21  * \file
    22  * \brief Internal math function implementations
    23  *
    24  */
    25 
    26 #ifndef M3G_CORE_INCLUDE
    27 #   error included by m3g_core.c; do not compile separately.
    28 #endif
    29 
    30 #include "m3g_defs.h"
    31 #include "m3g_memory.h"
    32 
    33 #if defined(M3G_SOFT_FLOAT)
    34 #include <math.h>
    35 #endif
    36 
    37 /*----------------------------------------------------------------------
    38  * Private types and definitions
    39  *--------------------------------------------------------------------*/
    40 
    41 /* Enumerated common matrix classifications */
    42 #define MC_IDENTITY             0x40100401 // 01000000 00010000 00000100 00000001
    43 #define MC_FRUSTUM              0x30BF0C03 // 00110000 10111111 00001100 00000011
    44 #define MC_PERSPECTIVE          0x30B00C03 // 00110000 10110000 00001100 00000011
    45 #define MC_ORTHO                0x7F300C03 // 01111111 00110000 00001100 00000011
    46 #define MC_PARALLEL             0x70300C03 // 01110000 00110000 00001100 00000011
    47 #define MC_SCALING_ROTATION     0x403F3F3F // 01000000 00111111 00111111 00111111
    48 #define MC_SCALING              0x40300C03 // 01000000 00110000 00001100 00000011
    49 #define MC_TRANSLATION          0x7F100401 // 01111111 00010000 00000100 00000001
    50 #define MC_X_ROTATION           0x403C3C01 // 01000000 00111100 00111100 00000001
    51 #define MC_Y_ROTATION           0x40330433 // 01000000 00110011 00000100 00110011
    52 #define MC_Z_ROTATION           0x40100F0F // 01000000 00010000 00001111 00001111
    53 #define MC_W_UNITY              0x7F3F3F3F // 01111111 00111111 00111111 00111111
    54 #define MC_GENERIC              0xFFFFFFFF
    55 
    56 /* Partial masks for individual matrix components */
    57 #define MC_TRANSLATION_PART     0x3F000000 // 00111111 00000000 00000000 00000000
    58 #define MC_SCALE_PART           0x00300C03 // 00000000 00110000 00001100 00000011
    59 #define MC_SCALE_ROTATION_PART  0x003F3F3F // 00000000 00111111 00111111 00111111
    60 
    61 /* Matrix element classification masks */
    62 #define ELEM_ZERO       0x00
    63 #define ELEM_ONE        0x01
    64 #define ELEM_MINUS_ONE  0x02
    65 #define ELEM_ANY        0x03
    66 
    67 /*!
    68  * \internal
    69  * \brief calculates the offset of a 4x4 matrix element in a linear
    70  * array
    71  *
    72  * \notes The current convention is column-major, as in OpenGL ES
    73  */
    74 #define MELEM(row, col) ((row) + ((col) << 2))
    75 
    76 /*!
    77  * \internal
    78  * \brief Macro for accessing 4x4 float matrix elements
    79  *
    80  * \param mtx pointer to the first element of the matrix
    81  * \param row matrix row
    82  * \param col matrix column
    83  */
    84 #define M44F(mtx, row, col)     ((mtx)->elem[MELEM((row), (col))])
    85 
    86 /*--------------------------------------------------------------------*/
    87 
    88 /*----------------------------------------------------------------------
    89  * Private functions
    90  *--------------------------------------------------------------------*/
    91 
    92 
    93 /*!
    94  * \internal
    95  * \brief ARM VFPv2 implementation of 4x4 matrix multiplication.
    96  *
    97  * \param dst	multiplication result
    98  * \param left	left-hand matrix
    99  * \param right right-hand matrix
   100  */
   101 #if defined(M3G_HW_FLOAT_VFPV2)
   102 
   103 __asm void _m3gGenericMatrixProduct(Matrix *dst,
   104                                     const Matrix *left, const Matrix *right)
   105 {
   106 // r0 = *dst
   107 // r1 = *left
   108 // r2 = *right
   109 
   110 		CODE32
   111 
   112 // save the VFP registers and set the vector STRIDE = 1, LENGTH = 4
   113 
   114 		FSTMFDD	sp!, {d8-d15}
   115 
   116 		FMRX	r3, FPSCR		 
   117 		BIC		r12, r3, #0x00370000
   118 		ORR		r12, #0x00030000
   119 		FMXR	FPSCR, r12
   120 
   121 // left = [a0  a1  a2  a3	right = [b0  b1  b2  b3
   122 //	 	   a4  a5  a6  a7		     b4  b5  b6  b7
   123 //		   a8  a9 a10 a11		     b8  b9 b10 b11
   124 //		  a12 a13 a14 a15]		    b12 b13 b14 b15]
   125 //
   126 // dst = [a0*b0+a4*b1+a8*b2+a12*b3  a1*b0+a5*b1+a9*b2+a13*b3  ..
   127 //		  a0*b4+a4*b5+a8*b6+a12*b7  a1*b4+a5*b5+a9*b6+a13*b7  ..
   128 //					.							.
   129 //					.							.
   130 
   131 		FLDMIAS	r1!, {s8-s23}		// load the left matrix [a0-a15] to registers s8-s23
   132 		FLDMIAS	r2!, {s0-s7}		// load [b0-b7] of right matrix to registers s0-s7
   133 		FMULS	s24, s8, s0			// [s24-s27]  = [a0*b0  a1*b0  a2*b0  a3*b0]
   134 		FMULS	s28, s8, s4			// [s28-s31]  = [a0*b4  a1*b4  a2*b4  a3*b4]
   135 		FMACS	s24, s12, s1		// [s24-s27] += [a4*b1  a5*b1  a6*b1  a7*b1]
   136 		FMACS	s28, s12, s5		// [s28-s31] += [a4*b5  a5*b5  a6*b5  a7*b5]
   137 		FMACS	s24, s16, s2		// [s24-s27] += [a8*b2  a9*b2  a10*b2 a11*b2]
   138 		FMACS	s28, s16, s6		// [s28-s31] += [a8*b6  a9*b6  a10*b6 a11*b6]
   139 		FMACS	s24, s20, s3		// [s24-s27] += [a12*b3 a13*b3 a14*b3 a15*b3]
   140 		FMACS	s28, s20, s7		// [s28-s31] += [a12*b7 a13*b37a14*b7 a15*b7]
   141 		FLDMIAS	r2!, {s0-s7}		// load [b8-b15]
   142 		FSTMIAS	r0!, {s24-s31}		// write [dst0-dst7]
   143 		FMULS	s24, s8, s0			
   144 		FMULS	s28, s8, s4			
   145 		FMACS	s24, s12, s1		
   146 		FMACS	s28, s12, s5		
   147 		FMACS	s24, s16, s2		
   148 		FMACS	s28, s16, s6		
   149 		FMACS	s24, s20, s3		
   150 		FMACS	s28, s20, s7		
   151 		FSTMIAS	r0!, {s24-s31}
   152 
   153 // Restore the VFP registers and return.
   154 
   155 		FMXR	FPSCR, r3
   156 
   157 		FLDMFDD	sp!, {d8-d15}	
   158 
   159 		BX		lr
   160 
   161 }
   162 #endif /* #if defined(M3G_HW_FLOAT_VFPV2) */
   163 
   164 
   165 /*------------------ Elementary float ------------------*/
   166 
   167 #if defined(M3G_SOFT_FLOAT)
   168 
   169 #if defined (M3G_BUILD_ARM)
   170 
   171 /*!
   172  * \internal
   173  * \brief Floating point multiplication implementation for ARM.
   174  */
   175 __asm M3Gfloat m3gMul(const M3Gfloat a,
   176                       const M3Gfloat b)
   177 {
   178     /**
   179      * Extract the exponents of the multiplicands and add them
   180      * together. Flush to zero if either exponent or their sum
   181      * is zero.
   182      */
   183 
   184     mov     r12, #0xff;
   185     ands    r2, r0, r12, lsl #23;   // exit if e1 == 0
   186     andnes  r3, r1, r12, lsl #23;   // exit if e2 == 0
   187     subne   r2, r2, #(127 << 23);
   188     addnes  r12, r2, r3;            // exit if e1+e2-127 <= 0
   189     movle   r0, #0;
   190     bxle    lr;
   191 
   192     /**
   193      * Determine the sign of the result. Note that the exponent
   194      * may have overflowed to the sign bit, and thus the result
   195      * may be an arbitrary negative value when it really should
   196      * be +Inf or -Inf.
   197      */
   198 
   199     teq     r0, r1;
   200     orrmi   r12, r12, #0x80000000;
   201 
   202     /**
   203      * Multiply the mantissas. First shift the mantissas up to
   204      * unsigned 1.31 fixed point, adding the leading "1" bit at
   205      * the same time, and finally do a 32x32 -> 64 bit unsigned
   206      * multiplication. The result is in unsigned 2.62 fixed point,
   207      * representing the interval [1.0, 4.0).
   208      */
   209 
   210     mov     r2, #0x80000000;
   211     orr     r0, r2, r0, lsl #8;
   212     orr     r1, r2, r1, lsl #8;
   213     umulls  r2, r3, r0, r1;
   214 
   215     /**
   216      * If the highest bit of the 64-bit result is set, then the
   217      * mantissa lies in [2.0, 4.0) and needs to be renormalized.
   218      * That is, the mantissa is shifted one bit to the right and
   219      * the exponent correspondingly increased by 1. Note that we
   220      * lose the leading "1" bit from the mantissa by adding it up
   221      * with the exponent.
   222      */
   223 
   224     subpl   r12, r12, #(1 << 23);    // no overflow: exponent -= 1
   225     addpl   r0, r12, r3, lsr #7;     // no overflow: exponent += 1
   226     addmi   r0, r12, r3, lsr #8;     // overflow: exponent += 1
   227     bx      lr;
   228 }
   229 
   230 /*!
   231  * \internal
   232  * \brief Floating point addition implementation for ARM.
   233  */
   234 __asm M3Gfloat m3gAdd(const M3Gfloat a,
   235                       const M3Gfloat b)
   236 {
   237     /**
   238      * If the operands have opposite signs then this is not really
   239      * an addition but a subtraction. Subtraction is much slower,
   240      * so we have a separate code path for it, rather than trying
   241      * to save space by handling both in the same place.
   242      */
   243     
   244     teq     r0, r1;
   245     bmi     _m3gSub;
   246 
   247     /**
   248      * Sort the operands such that the larger operand is in R0 and
   249      * the smaller in R1. The sign bits do not affect the ordering,
   250      * since they are known to be equal.
   251      */
   252     
   253     subs    r2, r0, r1;
   254     submi   r0, r0, r2;
   255     addmi   r1, r1, r2;
   256 
   257     /**
   258      * Extract the exponent of the smaller operand into R2 and compute
   259      * the difference between the larger and smaller exponent into R3.
   260      * (Note that the sign bits cancel out in the subtraction.) The
   261      * exponent delta tells how many bits the mantissa of the smaller
   262      * operand must be shifted to the right in order to bring the
   263      * operands into equal scale.
   264      */
   265 
   266     mov     r2, r1, lsr #23;
   267     rsb     r3, r2, r0, lsr #23;
   268 
   269     /**
   270      * Check if the exponent delta is bigger than 23 bits, or if the
   271      * smaller exponent is zero. In either case, exit the routine and
   272      * return the larger operand (which is already in R0). Note that
   273      * this means that subnormals are treated as zero.
   274      */
   275 
   276     rsbs    r12, r3, #23;               // N set, V clear if R3 > 23
   277     tstpl   r2, #0xff;                  // execute only if R3 <= 23
   278     bxle    lr;                         // exit if Z set or N != V
   279 
   280     /**
   281      * Extract the mantissas and shift them up to unsigned 1.31 fixed
   282      * point, inserting the implied leading "1" bit at the same time.
   283      * Finally, align the decimal points and add up the mantissas.
   284      */
   285     
   286     mov     r12, #0x80000000;
   287     orr     r0, r12, r0, lsl #8;
   288     orr     r1, r12, r1, lsl #8;
   289     adds    r0, r0, r1, lsr r3;
   290 
   291     /**
   292      * Compute the final exponent by adding up the smaller exponent
   293      * (R2), the exponent delta (R3), and the possible overflow bit
   294      * (carry flag). Note that in case of overflow, the leading "1"
   295      * has ended up in the carry flag and thus needs not be explicitly
   296      * discarded. Finally, put the mantissa together with the sign and
   297      * exponent.
   298      */
   299 
   300     adc     r2, r2, r3;                 // r2 = smallExp + deltaExp + overflow
   301     movcc   r0, r0, lsl #1;             // no overflow: discard leading 1
   302     mov     r0, r0, lsr #9;
   303     orr     r0, r0, r2, lsl #23;
   304     bx      lr;
   305     
   306 _m3gSub
   307 
   308     /**
   309      * Sort the operands such that the one with larger magnitude is
   310      * in R0 and has the correct sign (the sign of the final result),
   311      * while the smaller operand is in R1 with an inverted sign bit.
   312      */
   313     
   314     eor     r1, r1, #0x80000000;
   315     subs    r2, r0, r1;
   316     eormi   r2, r2, #0x80000000;
   317     submi   r0, r0, r2;
   318     addmi   r1, r1, r2;
   319 
   320     /**
   321      * Extract the exponent of the smaller operand into R2 and compute
   322      * the difference between the larger and smaller exponent into R3.
   323      * (Note that the sign bits cancel out in the subtraction.) The
   324      * exponent delta tells how many bits the mantissa of the smaller
   325      * operand must be shifted to the right in order to bring the
   326      * operands into equal scale.
   327      */
   328 
   329     mov     r2, r1, lsr #23;
   330     rsbs    r3, r2, r0, lsr #23;
   331 
   332     /**
   333      * Check if the exponent delta is bigger than 31 bits, or if the
   334      * smaller exponent is zero. In either case, exit the routine and
   335      * return the larger operand (which is already in R0). Note that
   336      * this means that subnormals are treated as zero.
   337      */
   338 
   339     rsbs    r12, r3, #31;               // N set, V clear if R3 > 31
   340     tstpl   r2, #0xff;                  // execute only if R3 <= 31
   341     bxle    lr;                         // exit if Z set or N != V
   342 
   343     /**
   344      * Extract the mantissas and shift them up to unsigned 1.31 fixed
   345      * point, inserting the implied leading "1" bit at the same time.
   346      * Then align the decimal points and subtract the smaller mantissa
   347      * from the larger one.
   348      */
   349     
   350     mov     r12, #0x80000000;
   351     orr     r0, r12, r0, lsl #8;
   352     orr     r1, r12, r1, lsl #8;
   353     subs    r0, r0, r1, lsr r3;
   354 
   355     /**
   356      * We split the range of possible results into three categories:
   357      * 
   358      *   1. [1.0, 2.0) ==> no renormalization needed (highest bit set)
   359      *   2. [0.5, 1.0) ==> only one left-shift needed
   360      *   3. (0.0, 0.5) ==> multiple left-shifts needed
   361      *   4. zero       ==> just return
   362      *   
   363      * Cases 1 and 2 are handled in the main code path. Cases 3 and 4
   364      * are less common by far, so we branch to a separate code fragment
   365      * for those.
   366      */
   367     
   368     movpls  r0, r0, lsl #1;             // Cases 2,3,4: shift left
   369     bpl     _m3gSubRenormalize;         // Cases 3 & 4: branch out
   370     
   371     /**
   372      * Now we have normalized the mantissa such that the highest bit
   373      * is set. Here we only need to adjust the exponent, if necessary,
   374      * and put the pieces together. Note that we lose the leading "1"
   375      * bit from the mantissa by adding it up with the exponent. We can
   376      * also do proper rounding (towards nearest) instead of truncation
   377      * (towards zero) at no extra cost!
   378      */
   379 
   380     sbc     r3, r3, #1;                 // deltaExp -= 1 (Case 1) or 2 (Case 2)
   381     add     r2, r2, r3;                 // resultExp = smallExp + deltaExp
   382     movs    r0, r0, lsr #8;             // shift mantissa, keep leading "1"
   383     adc     r0, r0, r2, lsl #23;        // resultExp += 1, mantissa += carry
   384     bx      lr;
   385 
   386     /**
   387      * Separate code path for cases 3 and 4 (see above). The mantissa
   388      * has already been shifted up by one, but the exponent has not
   389      * been correspondingly decreased. We also know that the highest
   390      * bit is still not set, and that the carry flag is clear.
   391      */
   392 
   393 _m3gSubRenormalize
   394     bxeq    lr;
   395     subcc   r3, r3, #2;
   396     movccs  r0, r0, lsl #2;
   397 
   398     /**
   399      * If the carry flag is still not set, i.e. there were more than
   400      * two leading zeros in the mantissa initially, loop until we
   401      * find the highest set bit.
   402      */
   403     
   404 _m3gSubRenormalizeLoop
   405     subcc   r3, r3, #1;
   406     movccs  r0, r0, lsl #1;
   407     bcc     _m3gSubRenormalizeLoop;
   408 
   409     /**
   410      * Now the leading "1" is in the carry flag, so we can just add up
   411      * the exponent and mantissa as usual, doing proper rounding at
   412      * the same time. However, cases where the exponent goes negative
   413      * (that is, underflows) must be detected and flushed to zero.
   414      */
   415     
   416     add     r3, r2, r3;
   417     movs    r0, r0, lsr #9;
   418     adc     r0, r0, r3, lsl #23;
   419     teq     r0, r2, lsl #23;
   420     movmi   r0, #0;
   421     bx      lr;
   422 }
   423 
   424 #else /* M3G_BUILD_ARM */
   425 
   426 /*!
   427  * \internal
   428  * \brief Floating point addition implementation
   429  *
   430  */
   431 static M3G_INLINE M3Gfloat m3gFloatAdd(const M3Guint aBits,
   432                                        const M3Guint bBits)
   433 {
   434     M3Guint large, small, signMask;
   435     
   436     /* Early exits for underflow cases */
   437 
   438     large = (M3Gint)(aBits & ~SIGN_MASK);
   439     if (large <= 0x00800000) {
   440         return INT_AS_FLOAT(bBits);
   441     }
   442     small = (M3Gint)(bBits & ~SIGN_MASK);
   443     if (small <= 0x00800000) {
   444         return INT_AS_FLOAT(aBits);
   445     }
   446 
   447     /* Swap the numbers so that "large" really is larger; the unsigned
   448      * (or de-signed) bitmasks for floats are nicely monotonous, so we
   449      * can compare directly.  Also store the sign of the larger number
   450      * for future reference. */
   451 
   452     if (small > large) {
   453         M3Gint temp = small;
   454         small = large;
   455         large = temp;
   456         signMask = (bBits & SIGN_MASK);
   457     }
   458     else {
   459         signMask = (aBits & SIGN_MASK);
   460     }
   461     
   462     {
   463         M3Guint res, overflow;
   464         M3Guint resExp, expDelta;
   465         
   466         /* Store the larger exponent as our candidate result exponent,
   467          * and compute the difference between the exponents */
   468 
   469         resExp = (large >> 23);
   470         expDelta = resExp - (small >> 23);
   471 
   472         /* Take an early exit if the change would be insignificant;
   473          * this also guards against odd results from shifting by more
   474          * than 31 (undefined in C) */
   475 
   476         if (expDelta >= 24) {
   477             res = large | signMask;
   478             return INT_AS_FLOAT(res);
   479         }
   480 
   481         /* Convert the mantissas into shifted integers, and shift the
   482          * smaller number to the same scale with the larger one. */
   483 
   484         large = (large & MANTISSA_MASK) | LEADING_ONE;
   485         small = (small & MANTISSA_MASK) | LEADING_ONE;
   486         small >>= expDelta;
   487         M3G_ASSERT(large >= small);
   488         
   489         /* Check whether we're really adding or subtracting the
   490          * smaller number, and branch to slightly different routines
   491          * respectively */
   492 
   493         if (((aBits ^ bBits) & SIGN_MASK) == 0) {
   494 
   495             /* Matching signs; just add the numbers and check for
   496              * overflow, shifting to compensate as necessary. */
   497 
   498             res = large + small;
   499             
   500             overflow = (res >> 24);
   501             res >>= overflow;
   502             resExp += overflow;
   503         }
   504         else {
   505 
   506             /* Different signs, so let's subtract the smaller value;
   507              * also check that we're not subtracting a number from
   508              * itself (so we don't have to normalize a zero below) */
   509             
   510             if (small == large) {
   511                 return 0.0f; /* x - x = 0 */
   512             }
   513 
   514             res = (large << 8) - (small << 8);
   515 
   516             /* Renormalize the number by shifting until we've got the
   517              * high bit in place */
   518 
   519             while ((res >> 24) == 0) {
   520                 res <<= 8;
   521                 resExp -= 8;
   522             }
   523             while ((res >> 31) == 0) {
   524                 res <<= 1;
   525                 --resExp;
   526             }
   527             res >>= 8;
   528         }
   529 
   530         /* Flush to zero in case of over/underflow of the exponent */
   531 
   532         if (resExp >= 255) {
   533             return 0.0f;
   534         }
   535         
   536         /* Compose the final number into "res"; note that we pull in
   537          * the sign of the original larger number, which should still
   538          * be valid */
   539 
   540         res &= MANTISSA_MASK;
   541         res |= (resExp << 23);
   542         res |= signMask;
   543 
   544         return INT_AS_FLOAT(res);
   545     }
   546 }
   547 
   548 /*!
   549  * \internal
   550  * \brief Floating point multiplication implementation
   551  */
   552 static M3G_INLINE M3Gfloat m3gFloatMul(const M3Guint aBits,
   553                                        const M3Guint bBits)
   554 {
   555     M3Guint a, b;
   556 
   557     /* Early exits for underflow and multiplication by zero */
   558 
   559     a = (aBits & ~SIGN_MASK);
   560     if (a <= 0x00800000) {
   561         return 0.0f;
   562     }
   563     
   564     b = (bBits & ~SIGN_MASK);
   565     if (b <= 0x00800000) {
   566         return 0.0f;
   567     }
   568     
   569     {
   570         M3Guint res, exponent, overflow;
   571         
   572         /* Compute the exponent of the result, assuming the mantissas
   573          * don't overflow; then mask out the original exponents */
   574         
   575         exponent = (a >> 23) + (b >> 23) - 127;
   576         a &= MANTISSA_MASK;
   577         b &= MANTISSA_MASK;
   578 
   579         /* Compute the new mantissa from:
   580          *
   581          *   (1 + a)(1 + b) = ab + a + b + 1
   582          *   
   583          * First shift the mantissas from 0.23 down to 0.16 for the
   584          * multiplication, then shift back to 0.23 for adding in the
   585          * "a + b + 1" part of the equation.  */
   586         
   587         res = (a >> 7) * (b >> 7);              /* "ab" at 0.32 */
   588         res = (res >> 9) + a + b + LEADING_ONE;
   589 
   590         /* Add the leading one, then normalize the result by checking
   591          * the overflow bit and dividing by two if necessary */
   592 
   593         overflow = (res >> 24);
   594         res >>= overflow;
   595         exponent += overflow;
   596 
   597         /* Flush to zero in case of over/underflow of the exponent */
   598 
   599         if (exponent >= 255) {
   600             return 0.0f;
   601         }
   602 
   603         /* Compose the final number into "res" */
   604 
   605         res &= MANTISSA_MASK;
   606         res |= (exponent << 23);
   607         res |= (aBits ^ bBits) & SIGN_MASK;
   608 
   609         return INT_AS_FLOAT(res);
   610     }
   611 }
   612 
   613 #endif /* M3G_BUILD_ARM */
   614 
   615 /*!
   616  * \internal
   617  * \brief Computes the signed fractional part of a floating point
   618  * number
   619  *
   620  * \param x  floating point value
   621  * \return x signed fraction of x in ]-1, 1[
   622  */
   623 static M3Gfloat m3gFrac(M3Gfloat x)
   624 {
   625     /* Quick exit for -1 < x < 1 */
   626     
   627     if (m3gAbs(x) < 1.0f) {
   628         return x;
   629     }
   630 
   631     /* Shift the mantissa to the proper place, mask out the integer
   632      * part, and renormalize */
   633     {
   634         M3Guint ix = FLOAT_AS_UINT(x);
   635         M3Gint expo = ((ix >> 23) & 0xFF) - 127;
   636         M3G_ASSERT(expo >= 0);
   637 
   638         /* The decimal part will always be zero for large values with
   639          * exponents over 24 */
   640         
   641         if (expo >= 24) {
   642             return 0.f;
   643         }
   644         else {
   645             
   646             /* Shift the integer part out of the mantissa and see what
   647              * we have left */
   648             
   649             M3Guint base = (ix & MANTISSA_MASK) | LEADING_ONE;
   650             base = (base << expo) & MANTISSA_MASK;
   651 
   652             /* Quick exit (and guard against infinite looping) for
   653              * zero */
   654 
   655             if (base == 0) {
   656                 return 0.f;
   657             }
   658 
   659             /* We now have an exponent of 0 (i.e. no shifting), but
   660              * must renormalize to get a set bit in place of the
   661              * hidden (implicit one) bit */
   662             
   663             expo = 0;
   664             
   665             while ((base >> 19) == 0) {
   666                 base <<= 4;
   667                 expo -= 4;
   668             }
   669             while ((base >> 23) == 0) {
   670                 base <<= 1;
   671                 --expo;
   672             }
   673 
   674             /* Compose the final number */
   675 
   676             ix =
   677                 (base & MANTISSA_MASK) |
   678                 ((expo + 127) << 23) |
   679                 (ix & SIGN_MASK);
   680             return INT_AS_FLOAT(ix);
   681         }
   682     }
   683 }
   684 
   685 #endif /* M3G_SOFT_FLOAT */
   686 
   687 #if defined(M3G_DEBUG)
   688 /*!
   689  * \internal
   690  * \brief Checks for NaN or infinity in a floating point input
   691  */
   692 static void m3gValidateFloats(int n, float *p)
   693 {
   694     while (n-- > 0) {
   695         M3G_ASSERT(EXPONENT(*p) < 120);
   696         ++p;
   697     }
   698 }
   699 #else
   700 #   define m3gValidateFloats(n, p)
   701 #endif
   702     
   703 /*------------------ Trigonometry and exp ----------*/
   704 
   705 
   706 #if defined(M3G_SOFT_FLOAT)
   707 /*!
   708  * \internal
   709  * \brief Sine for the first quadrant
   710  *
   711  * \param x floating point value in [0, PI/2]
   712  * \return sine of \c x
   713  */
   714 static M3Gfloat m3gSinFirstQuadrant(M3Gfloat x)
   715 {
   716     M3Guint bits = FLOAT_AS_UINT(x);
   717     
   718     if (bits <= 0x3ba3d70au)    /* 0.005f */
   719         return x;
   720     else {
   721         static const M3Gfloat sinTermLut[4] = {
   722             -1.0f / (2*3),
   723             -1.0f / (4*5),
   724             -1.0f / (6*7),
   725             -1.0f / (8*9)
   726         };
   727 
   728         M3Gfloat xx = m3gSquare(x);
   729         M3Gfloat sinx = x;
   730         int i;
   731 
   732         for (i = 0; i < 4; ++i) {
   733             x    = m3gMul(x, m3gMul(xx, sinTermLut[i]));
   734             sinx = m3gAdd(sinx, x);
   735         }
   736 
   737         return sinx;
   738     }
   739 }
   740 #endif /* M3G_SOFT_FLOAT */
   741 
   742 #if defined(M3G_SOFT_FLOAT)
   743 /*!
   744  * \internal
   745  * \brief Computes sine for the first period
   746  *
   747  * \param x floating point value in [0, 2PI]
   748  * \return sine of x
   749  */
   750 static M3Gfloat m3gSinFirstPeriod(const M3Gfloat x)
   751 {
   752     M3G_ASSERT(x >= 0 && x <= TWO_PI);
   753     
   754     if (x >= PI) {
   755         return m3gNegate(m3gSinFirstQuadrant(x >= ONE_AND_HALF_PI ?
   756                                              m3gSub(TWO_PI, x) :
   757                                              m3gSub(x, PI)));
   758     }
   759     return m3gSinFirstQuadrant((x >= HALF_PI) ? m3gSub(PI, x) : x);
   760 }
   761 #endif /* M3G_SOFT_FLOAT */
   762 
   763 /*------------- Float vs. int conversion helpers -------------*/
   764 
   765 /*!
   766  * \internal
   767  * \brief Scales and clamps a float to unsigned byte range
   768  */
   769 static M3G_INLINE M3Gint m3gFloatToByte(const M3Gfloat a)
   770 {
   771     return m3gRoundToInt(m3gMul(255.f, m3gClampFloat(a, 0.f, 1.f)));
   772 }
   773 
   774 /*------------------ Vector helpers ------------------*/
   775 
   776 /*!
   777  * \internal
   778  * \brief Computes the norm of a floating-point 3-vector
   779  */
   780 static M3G_INLINE M3Gfloat m3gNorm3(const M3Gfloat *v)
   781 {
   782     return m3gAdd(m3gAdd(m3gSquare(v[0]), m3gSquare(v[1])),
   783                   m3gSquare(v[2]));
   784 }
   785 
   786 /*!
   787  * \internal
   788  * \brief Computes the norm of a floating-point 4-vector
   789  */
   790 static M3G_INLINE M3Gfloat m3gNorm4(const M3Gfloat *v)
   791 {
   792     return m3gAdd(m3gAdd(m3gSquare(v[0]), m3gSquare(v[1])),
   793                   m3gAdd(m3gSquare(v[2]), m3gSquare(v[3])));
   794 }
   795 
   796 /*!
   797  * \internal
   798  * \brief Scales a floating-point 3-vector
   799  */
   800 static M3G_INLINE void m3gScale3(M3Gfloat *v, M3Gfloat s)
   801 {
   802     v[0] = m3gMul(v[0], s);
   803     v[1] = m3gMul(v[1], s);
   804     v[2] = m3gMul(v[2], s);
   805 }
   806 
   807 /*!
   808  * \internal
   809  * \brief Scales a floating-point 4-vector
   810  */
   811 static M3G_INLINE void m3gScale4(M3Gfloat *v, M3Gfloat s)
   812 {
   813     v[0] = m3gMul(v[0], s);
   814     v[1] = m3gMul(v[1], s);
   815     v[2] = m3gMul(v[2], s);
   816     v[3] = m3gMul(v[3], s);
   817 }
   818 
   819 
   820 /*------------------ Matrices ------------------*/
   821 
   822 /*!
   823  * \internal
   824  */
   825 static M3G_INLINE M3Gbool m3gIsClassified(const Matrix *mtx)
   826 {
   827     M3G_ASSERT(mtx != NULL);
   828     return (M3Gbool) mtx->classified;
   829 }
   830 
   831 /*!
   832  * \internal
   833  * \brief Returns a classification for a single floating point number
   834  */
   835 static M3G_INLINE M3Guint m3gElementClass(const M3Gfloat x)
   836 {
   837     if (IS_ZERO(x)) {
   838         return ELEM_ZERO;
   839     }
   840     else if (IS_ONE(x)) {
   841         return ELEM_ONE;
   842     }
   843     else if (IS_MINUS_ONE(x)) {
   844         return ELEM_MINUS_ONE;
   845     }
   846     return ELEM_ANY;
   847 }
   848 
   849 /*!
   850  * \internal
   851  * \brief Computes the classification mask of a matrix
   852  *
   853  * The mask is constructed from two bits per elements, with the lowest
   854  * two bits corresponding to the first element in the \c elem array of
   855  * the matrix.
   856  */
   857 static void m3gClassify(Matrix *mtx)
   858 {
   859     M3Guint mask = 0;
   860     const M3Gfloat *p;
   861     int i;
   862 
   863     M3G_ASSERT(mtx != NULL);
   864     M3G_ASSERT(!m3gIsClassified(mtx));
   865 
   866     p = mtx->elem;
   867     for (i = 0; i < 16; ++i) {
   868         M3Gfloat elem = *p++;
   869         mask |= (m3gElementClass(elem) << (i*2));
   870     }
   871     mtx->mask = mask;
   872     mtx->classified = M3G_TRUE;
   873 }
   874 
   875 /*!
   876  * \internal
   877  * \brief Sets matrix classification directly
   878  */
   879 static M3G_INLINE void m3gClassifyAs(M3Guint mask, Matrix *mtx)
   880 {
   881     M3G_ASSERT(mtx != NULL);
   882     mtx->mask = mask;
   883     mtx->classified = M3G_TRUE;
   884     mtx->complete = M3G_FALSE;
   885 }
   886 
   887 /*!
   888  * \internal
   889  * \brief Attempts to classify a matrix more precisely
   890  *
   891  * Tries to classify all "free" elements of a matrix into one of the
   892  * predefined constants to improve precision and performance in
   893  * subsequent calculations.
   894  */
   895 static void m3gSubClassify(Matrix *mtx)
   896 {
   897     M3G_ASSERT_PTR(mtx);
   898     M3G_ASSERT(m3gIsClassified(mtx));
   899     {
   900         const M3Gfloat *p = mtx->elem;
   901         M3Guint inMask = mtx->mask;
   902         M3Guint outMask = inMask;
   903         int i;
   904 
   905         for (i = 0; i < 16; ++i, inMask >>= 2) {
   906             M3Gfloat elem = *p++;
   907             if ((inMask & 0x03) == ELEM_ANY) {
   908                 outMask &= ~(0x03u << (i*2));
   909                 outMask |= (m3gElementClass(elem) << (i*2));
   910             }
   911         }
   912         mtx->mask = outMask;
   913     }
   914 }
   915 
   916 /*!
   917  * \internal
   918  * \brief Fills in the implicit values for a classified matrix
   919  */
   920 static void m3gFillClassifiedMatrix(Matrix *mtx)
   921 {
   922     int i;
   923     M3Guint mask;
   924     M3Gfloat *p;
   925 
   926     M3G_ASSERT(mtx != NULL);
   927     M3G_ASSERT(mtx->classified);
   928     M3G_ASSERT(!mtx->complete);
   929 
   930     mask = mtx->mask;
   931     p = mtx->elem;
   932 
   933     for (i = 0; i < 16; ++i, mask >>= 2) {
   934         unsigned elem = (mask & 0x03);
   935         switch (elem) {
   936         case ELEM_ZERO:         *p++ =  0.0f; break;
   937         case ELEM_ONE:          *p++ =  1.0f; break;
   938         case ELEM_MINUS_ONE:    *p++ = -1.0f; break;
   939         default:                ++p;
   940         }
   941     }
   942     mtx->complete = M3G_TRUE;
   943 }
   944 
   945 
   946 #if !defined(M3G_HW_FLOAT)
   947 /*!
   948  * \internal
   949  * \brief Performs one multiply-add of classified matrix elements
   950  *
   951  * \param amask element class of the first multiplicand
   952  * \param a     float value of the first multiplicand
   953  * \param bmask element class of the second multiplicand
   954  * \param b     float value of the second multiplicand
   955  * \param c     float value to add
   956  * \return a * b + c
   957  * 
   958  * \notes inline, as only called from the matrix product function
   959  */
   960 static M3G_INLINE M3Gfloat m3gClassifiedMadd(
   961     const M3Gbitmask amask, const M3Gfloat *pa,
   962     const M3Gbitmask bmask, const M3Gfloat *pb,
   963     const M3Gfloat c)
   964 {
   965     /* Check for zero product to reduce the switch cases below */
   966     
   967     if (amask == ELEM_ZERO || bmask == ELEM_ZERO) {
   968         return c;    
   969     }
   970 
   971     /* Branch based on the classification of a */
   972     
   973     switch (amask) {
   974         
   975     case ELEM_ANY:
   976         if (bmask == ELEM_ONE) {
   977             return m3gAdd(*pa, c);      /*  a * 1 + c  =  a + c  */
   978         }
   979         if (bmask == ELEM_MINUS_ONE) {
   980             return m3gSub(c, *pa);      /*  a * -1 + c = -a + c  */
   981         }
   982         return m3gMadd(*pa, *pb, c);    /*  a * b + c            */
   983         
   984     case ELEM_ONE:
   985         if (bmask == ELEM_ONE) {
   986             return m3gAdd(c, 1.f);      /*  1 * 1 + c  = 1 + c   */
   987         }
   988         if (bmask == ELEM_MINUS_ONE) {
   989             return m3gSub(c, 1.f);      /*  1 * -1 + c = -1 + c  */
   990         }
   991         return m3gAdd(*pb, c);          /*  1 * b + c  =  b + c  */
   992         
   993     case ELEM_MINUS_ONE:
   994         if (bmask == ELEM_ONE) {
   995             return m3gSub(c, 1.f);      /* -1 * 1 + c  = -1 + c  */
   996         }
   997         if (bmask == ELEM_MINUS_ONE) {
   998             return m3gAdd(c, 1.f);      /* -1 * -1 + c =  1 + c  */
   999         }
  1000         return m3gSub(c, *pb);          /* -1 * b + c  = -b + c  */
  1001         
  1002     default:
  1003         M3G_ASSERT(M3G_FALSE);
  1004         return 0.0f;
  1005     }
  1006 }
  1007 #endif /*!defined(M3G_HW_FLOAT)*/
  1008 
  1009 /*!
  1010  * \internal
  1011  * \brief Computes a generic 4x4 matrix product
  1012  */
  1013 static void m3gGenericMatrixProduct(Matrix *dst,
  1014                                     const Matrix *left, const Matrix *right)
  1015 {
  1016     M3G_ASSERT(dst != NULL && left != NULL && right != NULL);
  1017 
  1018     {
  1019 #       if defined(M3G_HW_FLOAT)
  1020         if (!left->complete) {
  1021             m3gFillClassifiedMatrix((Matrix*)left);
  1022         }
  1023         if (!right->complete) {
  1024             m3gFillClassifiedMatrix((Matrix*)right);
  1025         }
  1026 #       else
  1027         const unsigned lmask = left->mask;
  1028         const unsigned rmask = right->mask;
  1029 #       endif
  1030         
  1031 #if defined(M3G_HW_FLOAT_VFPV2)
  1032 		_m3gGenericMatrixProduct(dst, left, right);
  1033 #else	
  1034         {
  1035             int row;
  1036 
  1037             for (row = 0; row < 4; ++row) {
  1038                 int col;
  1039                 for (col = 0; col < 4; ++col) {
  1040                     int k;
  1041                     M3Gfloat a = 0;
  1042                     for (k = 0; k < 4; ++k) {
  1043                         M3Gint lidx = MELEM(row, k);
  1044                         M3Gint ridx = MELEM(k, col);
  1045 #                       if defined(M3G_HW_FLOAT)
  1046                         a = m3gMadd(left->elem[lidx], right->elem[ridx], a);
  1047 #                       else
  1048                         a = m3gClassifiedMadd((lmask >> (2 * lidx)) & 3,
  1049                                               &left->elem[lidx],
  1050                                               (rmask >> (2 * ridx)) & 3,
  1051                                               &right->elem[ridx],
  1052                                               a);
  1053 #                       endif /*!M3G_HW_FLOAT*/
  1054                     }
  1055                     M44F(dst, row, col) = a;
  1056                 }
  1057             }
  1058         }
  1059 #endif /*!M3G_HW_FLOAT_VFPV2*/
  1060     }
  1061     dst->complete = M3G_TRUE;
  1062     dst->classified = M3G_FALSE;
  1063 }
  1064 
  1065 /*!
  1066  * \internal
  1067  * \brief Converts a float vector to 16-bit integers
  1068  *
  1069  * \param size   vector length
  1070  * \param vec    pointer to float vector
  1071  * \param scale  scale of output values, as the number of bits to
  1072  *               shift left to get actual values
  1073  * \param outVec output value vector
  1074  */
  1075 static void m3gFloatVecToShort(M3Gint size, const M3Gfloat *vec,
  1076                                M3Gint scale, M3Gshort *outVec)
  1077 {
  1078     const M3Guint *vecInt = (const M3Guint*) vec;
  1079     M3Gint i;
  1080     
  1081     for (i = 0; i < size; ++i) {
  1082         M3Guint a = vecInt[i];
  1083         if ((a & ~SIGN_MASK) < (1 << 23)) {
  1084             *outVec++ = 0;
  1085         }
  1086         else {
  1087             M3Gint shift =
  1088                 scale - ((M3Gint)((vecInt[i] >> 23) & 0xFFu) - (127 + 23));
  1089             M3G_ASSERT(shift > 8); /* or the high bits will overflow */
  1090             
  1091             if (shift > 23) {
  1092                 *outVec++ = 0;
  1093             }
  1094             else {
  1095                 M3Gint out =
  1096                     (M3Gint) (((a & MANTISSA_MASK) | LEADING_ONE) >> shift);
  1097                 if (a >> 31) {
  1098                     out = -out;
  1099                 }
  1100                 M3G_ASSERT(m3gInRange(out, -32767, 32767));
  1101                 *outVec++ = (M3Gshort) out;
  1102             }
  1103         }
  1104     }
  1105 }
  1106 
  1107 /*----------------------------------------------------------------------
  1108  * Internal functions
  1109  *--------------------------------------------------------------------*/
  1110 
  1111 /*--------------------------------------------------------------------*/
  1112 
  1113 #if defined(M3G_SOFT_FLOAT)
  1114 
  1115 #if !defined (M3G_BUILD_ARM)
  1116 
  1117 static M3Gfloat m3gAdd(const M3Gfloat a, const M3Gfloat b)
  1118 {
  1119     return m3gFloatAdd(FLOAT_AS_UINT(a), FLOAT_AS_UINT(b));
  1120 }
  1121 
  1122 static M3Gfloat m3gMul(const M3Gfloat a, const M3Gfloat b)
  1123 {
  1124     return m3gFloatMul(FLOAT_AS_UINT(a), FLOAT_AS_UINT(b));
  1125 }
  1126 
  1127 #endif /* M3G_BUILD_ARM */
  1128 
  1129 /*!
  1130  * \internal
  1131  * \brief Computes the reciprocal of the square root
  1132  *
  1133  * \param x a floating point value
  1134  * \return 1 / square root of \c x
  1135  */
  1136 static M3Gfloat m3gRcpSqrt(const M3Gfloat x)
  1137 {
  1138     /* Approximation followed by Newton-Raphson iteration a'la
  1139      * "Floating-point tricks" by J. Blinn, but we iterate several
  1140      * times to improve precision */
  1141     
  1142     M3Gint i = (M3G_FLOAT_ONE + (M3G_FLOAT_ONE >> 1))
  1143         - (FLOAT_AS_UINT(x) >> 1);
  1144     M3Gfloat y = INT_AS_FLOAT(i);
  1145     for (i = 0; i < 3; ++i) {
  1146         y = m3gMul(y, m3gSub(1.5f, m3gHalf(m3gMul(x, m3gSquare(y)))));
  1147     }
  1148     return y;
  1149 }
  1150 
  1151 /*!
  1152  * \internal
  1153  * \brief Computes the square root
  1154  *
  1155  * \param x a floating point value
  1156  * \return square root of \c x
  1157  */
  1158 static M3Gfloat m3gSqrt(const M3Gfloat x)
  1159 {
  1160     /* Approximation followed by Newton-Raphson iteration a'la
  1161      * "Floating-point tricks" by J. Blinn, but we iterate several
  1162      * times to improve precision */
  1163 
  1164     M3Gint i = (FLOAT_AS_UINT(x) >> 1) + (M3G_FLOAT_ONE >> 1);
  1165     M3Gfloat y = INT_AS_FLOAT(i);
  1166     for (i = 0; i < 2; ++i) {
  1167         y = m3gDiv(m3gAdd(m3gSquare(y), x), m3gDouble(y));
  1168     }
  1169     return y;
  1170 }
  1171 
  1172 #endif /* M3G_SOFT_FLOAT */
  1173 
  1174 /*--------------------------------------------------------------------*/
  1175 
  1176 #if defined(M3G_SOFT_FLOAT)
  1177 /*!
  1178  * \internal
  1179  */
  1180 static M3Gfloat m3gArcCos(const M3Gfloat x)
  1181 {
  1182     return (M3Gfloat) acos(x);
  1183 }
  1184 
  1185 /*!
  1186  * \internal
  1187  */
  1188 static M3Gfloat m3gArcTan(const M3Gfloat y, const M3Gfloat x)
  1189 {
  1190     return (M3Gfloat) atan2(y, x);
  1191 }
  1192 
  1193 /*!
  1194  * \internal
  1195  */
  1196 static M3Gfloat m3gCos(const M3Gfloat x)
  1197 {
  1198     return m3gSin(m3gAdd(x, HALF_PI));
  1199 }
  1200 
  1201 /*!
  1202  * \internal
  1203  * \brief
  1204  */
  1205 static M3Gfloat m3gSin(const M3Gfloat x)
  1206 {
  1207     M3Gfloat f = x;
  1208     
  1209     /* If x is greater than two pi, do a modulo operation to bring it
  1210      * back in range for the internal sine function */
  1211     
  1212     if (m3gAbs(f) >= TWO_PI) {
  1213         f = m3gMul (f, (1.f / TWO_PI));
  1214         f = m3gFrac(f);
  1215         f = m3gMul (f, TWO_PI);
  1216     }
  1217 
  1218     /* Compute the result, negating both the input value and the
  1219      * result if x was negative */
  1220     {
  1221         M3Guint i = FLOAT_AS_UINT(f);
  1222         M3Guint neg = (i & SIGN_MASK);
  1223         i ^= neg;
  1224         f = m3gSinFirstPeriod(INT_AS_FLOAT(i));
  1225         i = FLOAT_AS_UINT(f) ^ neg;
  1226         return INT_AS_FLOAT(i);
  1227     }
  1228 }
  1229 
  1230 /*!
  1231  * \internal
  1232  */
  1233 static M3Gfloat m3gTan(const M3Gfloat x)
  1234 {
  1235     return (M3Gfloat) tan(x);
  1236 }
  1237 
  1238 /*!
  1239  * \internal
  1240  */
  1241 static M3Gfloat m3gExp(const M3Gfloat a)
  1242 {
  1243     return (M3Gfloat) exp(a);
  1244 }
  1245 #endif /* M3G_SOFT_FLOAT */
  1246 
  1247 /*!
  1248  * \brief Checks whether the bottom row of a matrix is 0 0 0 1
  1249  */
  1250 static M3Gbool m3gIsWUnity(const Matrix *mtx)
  1251 {
  1252     M3G_ASSERT_PTR(mtx);
  1253 
  1254     if (!m3gIsClassified(mtx)) {
  1255         return (IS_ZERO(M44F(mtx, 3, 0)) &&
  1256                 IS_ZERO(M44F(mtx, 3, 1)) &&
  1257                 IS_ZERO(M44F(mtx, 3, 2)) &&
  1258                 IS_ONE (M44F(mtx, 3, 3)));
  1259     }
  1260     else {
  1261         return ((mtx->mask & 0xC0C0C0C0) == (ELEM_ONE << 30));
  1262     }
  1263 }
  1264 
  1265 /*!
  1266  * \brief Makes a quaternion by exponentiating a quaternion logarithm
  1267  */
  1268 static void m3gExpQuat(Quat *quat, const Vec3 *qExp)
  1269 {
  1270     M3Gfloat theta;
  1271 
  1272     M3G_ASSERT_PTR(quat);
  1273     M3G_ASSERT_PTR(qExp);
  1274 
  1275     theta = m3gSqrt(m3gAdd(m3gAdd(m3gSquare(qExp->x),
  1276                                   m3gSquare(qExp->y)),
  1277                            m3gSquare(qExp->z)));
  1278 
  1279     if (theta > EPSILON) {
  1280         M3Gfloat s = m3gMul(m3gSin(theta), m3gRcp(theta));
  1281         quat->x = m3gMul(qExp->x, s);
  1282         quat->y = m3gMul(qExp->y, s);
  1283         quat->z = m3gMul(qExp->z, s);
  1284         quat->w = m3gCos(theta);
  1285     }
  1286     else {
  1287         quat->x = quat->y = quat->z = 0.0f;
  1288         quat->w = 1.0f;
  1289     }
  1290 }
  1291 
  1292 /*!
  1293  * \brief Natural logarithm of a unit quaternion.
  1294  */
  1295 static void m3gLogQuat(Vec3 *qLog, const Quat *quat)
  1296 {
  1297     M3Gfloat sinTheta = m3gSqrt(m3gNorm3((const float *) quat));
  1298 
  1299     if (sinTheta > EPSILON) {
  1300         M3Gfloat s = m3gArcTan(sinTheta, quat->w) / sinTheta;
  1301         qLog->x = m3gMul(s, quat->x);
  1302         qLog->y = m3gMul(s, quat->y);
  1303         qLog->z = m3gMul(s, quat->z);
  1304     }
  1305     else {
  1306         qLog->x = qLog->y = qLog->z = 0.0f;
  1307     }
  1308 }
  1309 
  1310 /*!
  1311  * \brief Make quaternion the "logarithmic difference" between two
  1312  * other quaternions.
  1313  */
  1314 static void m3gLogDiffQuat(Vec3 *logDiff,
  1315                            const Quat *from, const Quat *to)
  1316 {
  1317     Quat temp;
  1318     temp.x = m3gNegate(from->x);
  1319     temp.y = m3gNegate(from->y);
  1320     temp.z = m3gNegate(from->z);
  1321     temp.w =           from->w;
  1322     m3gMulQuat(&temp, to);
  1323     m3gLogQuat(logDiff, &temp);
  1324 }
  1325 
  1326 /*!
  1327  * \brief Rounds a float to the nearest integer
  1328  *
  1329  * Overflows are clamped to the maximum or minimum representable
  1330  * value.
  1331  */
  1332 static M3Gint m3gRoundToInt(const M3Gfloat a)
  1333 {
  1334     M3Guint base = FLOAT_AS_UINT(a);
  1335     M3Gint signMask, expo;
  1336 
  1337     /* Decompose the number into sign, exponent, and base number */
  1338     
  1339     signMask = ((M3Gint) base >> 31);   /* -> 0 or 0xFFFFFFFF */
  1340     expo = (M3Gint)((base >> 23) & 0xFF) - 127;
  1341     
  1342     /* First check for large values and return either the negative or
  1343      * the positive maximum integer in case of overflow.  The overflow
  1344      * check can be made on the exponent alone, as large floats are
  1345      * spaced several integer values apart so that nothing will
  1346      * overflow because of rounding later on */
  1347     
  1348     if (expo >= 31) {
  1349         return (M3Gint)((1U << 31) - 1 + (((M3Guint) signMask) & 1));
  1350     }
  1351 
  1352     /* Also check for underflow to avoid problems with shifting by
  1353      * more than 31 */
  1354 
  1355     if (expo < -1) {
  1356         return 0;
  1357     }
  1358     
  1359     /* Mask out the sign and exponent bits, shift the base number so
  1360      * that the lowest bit corresponds to one half, then add one
  1361      * (half) and shift to round to the closest integer. */
  1362 
  1363     base = (base | LEADING_ONE) << 8;   /* shift mantissa to 1.31 */
  1364     base =  base >> (30 - expo);        /* integer value as 31.1 */
  1365     base = (base + 1) >> 1;             /* round to nearest 32.0 */
  1366     
  1367     /* Factor in the sign (negate if originally negative) and
  1368      * return */
  1369 
  1370     return ((M3Gint) base ^ signMask) - signMask;
  1371 }
  1372 
  1373 /*!
  1374  * \brief Calculates ray-triangle intersection.
  1375  *
  1376  * http://www.acm.org/jgt/papers/MollerTrumbore97
  1377  */
  1378 static M3Gbool m3gIntersectTriangle(const Vec3 *orig, const Vec3 *dir,
  1379                                     const Vec3 *vert0, const Vec3 *vert1, const Vec3 *vert2,
  1380                                     Vec3 *tuv, M3Gint cullMode)
  1381 {
  1382     Vec3 edge1, edge2, tvec, pvec, qvec;
  1383     M3Gfloat det,inv_det;
  1384 
  1385     /* find vectors for two edges sharing vert0 */
  1386     edge1 = *vert1;
  1387     edge2 = *vert2;
  1388     m3gSubVec3(&edge1, vert0);
  1389     m3gSubVec3(&edge2, vert0);
  1390 
  1391     /* begin calculating determinant - also used to calculate U parameter */
  1392     m3gCross(&pvec, dir, &edge2);
  1393 
  1394     /* if determinant is near zero, ray lies in plane of triangle */
  1395     det = m3gDot3(&edge1, &pvec);
  1396 
  1397     if (cullMode == 0 && det <= 0) return M3G_FALSE;
  1398     if (cullMode == 1 && det >= 0) return M3G_FALSE;
  1399 
  1400     if (det > -EPSILON && det < EPSILON)
  1401         return M3G_FALSE;
  1402     inv_det = m3gRcp(det);
  1403 
  1404     /* calculate distance from vert0 to ray origin */
  1405     tvec = *orig;
  1406     m3gSubVec3(&tvec, vert0);
  1407 
  1408     /* calculate U parameter and test bounds */
  1409     tuv->y = m3gMul(m3gDot3(&tvec, &pvec), inv_det);
  1410     if (tuv->y < 0.0f || tuv->y > 1.0f)
  1411         return M3G_FALSE;
  1412 
  1413     /* prepare to test V parameter */
  1414     m3gCross(&qvec, &tvec, &edge1);
  1415 
  1416     /* calculate V parameter and test bounds */
  1417     tuv->z = m3gMul(m3gDot3(dir, &qvec), inv_det);
  1418     if (tuv->z < 0.0f || m3gAdd(tuv->y, tuv->z) > 1.0f)
  1419         return M3G_FALSE;
  1420 
  1421     /* calculate t, ray intersects triangle */
  1422     tuv->x = m3gMul(m3gDot3(&edge2, &qvec), inv_det);
  1423 
  1424     return M3G_TRUE;
  1425 }
  1426 
  1427 /*!
  1428  * \brief Calculates ray-box intersection.
  1429  *
  1430  * http://www.siggraph.org/education/materials/HyperGraph/raytrace/rtinter3.htm
  1431  */
  1432 
  1433 #define XO	orig->x
  1434 #define YO	orig->y
  1435 #define ZO	orig->z
  1436 #define XD	dir->x
  1437 #define YD	dir->y
  1438 #define ZD	dir->z
  1439 
  1440 #define XL	box->min[0]
  1441 #define YL	box->min[1]
  1442 #define ZL	box->min[2]
  1443 #define XH	box->max[0]
  1444 #define YH	box->max[1]
  1445 #define ZH	box->max[2]
  1446 
  1447 /*!
  1448  * \internal
  1449  * \brief Ray - bounding box intersection
  1450  *
  1451  */
  1452 static M3Gbool m3gIntersectBox(const Vec3 *orig, const Vec3 *dir, const AABB *box)
  1453 {
  1454 	M3Gfloat tnear = M3G_MIN_NEGATIVE_FLOAT;
  1455 	M3Gfloat tfar  = M3G_MAX_POSITIVE_FLOAT;
  1456 	M3Gfloat t1, t2, temp;
  1457 
  1458 	/* X slab */
  1459 	if(XD != 0) {
  1460 		t1 = m3gSub(XL, XO) / XD;
  1461 		t2 = m3gSub(XH, XO) / XD;
  1462 
  1463 		if(t1 > t2) {
  1464 			temp = t1;
  1465 			t1 = t2;
  1466 			t2 = temp;
  1467 		}
  1468 
  1469 		if(t1 > tnear) tnear = t1;
  1470 		if(t2 < tfar) tfar = t2;
  1471 
  1472 		if(tnear > tfar) return M3G_FALSE;
  1473 		if(tfar < 0) return M3G_FALSE;
  1474 	}
  1475 	else {
  1476 		if(XO > XH || XO < XL) return M3G_FALSE;
  1477 	}
  1478 
  1479 	/* Y slab */
  1480 	if(YD != 0) {
  1481 		t1 = m3gSub(YL, YO) / YD;
  1482 		t2 = m3gSub(YH, YO) / YD;
  1483 
  1484 		if(t1 > t2) {
  1485 			temp = t1;
  1486 			t1 = t2;
  1487 			t2 = temp;
  1488 		}
  1489 
  1490 		if(t1 > tnear) tnear = t1;
  1491 		if(t2 < tfar) tfar = t2;
  1492 
  1493 		if(tnear > tfar) return M3G_FALSE;
  1494 		if(tfar < 0) return M3G_FALSE;
  1495 	}
  1496 	else {
  1497 		if(YO > YH || YO < YL) return M3G_FALSE;
  1498 	}
  1499 
  1500 	/* Z slab */
  1501 	if(ZD != 0) {
  1502 		t1 = m3gSub(ZL, ZO) / ZD;
  1503 		t2 = m3gSub(ZH, ZO) / ZD;
  1504 
  1505 		if(t1 > t2) {
  1506 			temp = t1;
  1507 			t1 = t2;
  1508 			t2 = temp;
  1509 		}
  1510 
  1511 		if(t1 > tnear) tnear = t1;
  1512 		if(t2 < tfar) tfar = t2;
  1513 
  1514 		if(tnear > tfar) return M3G_FALSE;
  1515 		if(tfar < 0) return M3G_FALSE;
  1516 	}
  1517 	else {
  1518 		if(ZO > ZH || ZO < ZL) return M3G_FALSE;
  1519 	}
  1520 
  1521 	return M3G_TRUE;
  1522 }
  1523 
  1524 /*!
  1525  * \brief Calculates the intersection of two rectangles. Always fills
  1526  * the intersection result.
  1527  *
  1528  * \param dst   result of the intersection
  1529  * \param r1    rectangle 1
  1530  * \param r2    rectangle 2
  1531  */
  1532 static M3Gbool m3gIntersectRectangle(M3GRectangle *dst, M3GRectangle *r1, M3GRectangle *r2)
  1533 {
  1534     M3Gbool intersects = M3G_TRUE;
  1535     M3Gint min, max;
  1536 
  1537     max = (r1->x) >= (r2->x) ? (r1->x) : (r2->x);
  1538     min = (r1->x + r1->width) <= (r2->x + r2->width) ? (r1->x + r1->width) : (r2->x + r2->width);
  1539     if ((min - max) < 0) intersects = M3G_FALSE;
  1540     dst->x = max;
  1541     dst->width = min - max;
  1542 
  1543     max = (r1->y) >= (r2->y) ? (r1->y) : (r2->y);
  1544     min = (r1->y + r1->height) <= (r2->y + r2->height) ? (r1->y + r1->height) : (r2->y + r2->height);
  1545     if ((min - max) < 0) intersects = M3G_FALSE;
  1546     dst->y = max;
  1547     dst->height = min - max;
  1548 
  1549     return intersects;
  1550 }
  1551 
  1552 /*-------- float-to-int color conversions --------*/
  1553 
  1554 static M3Guint m3gAlpha1f(M3Gfloat a)
  1555 {
  1556     M3Guint alpha = (M3Guint) m3gFloatToByte(a);
  1557     return (alpha << 24) | M3G_RGB_MASK;
  1558 }
  1559 
  1560 static M3Guint m3gColor3f(M3Gfloat r, M3Gfloat g, M3Gfloat b)
  1561 {
  1562     return ((M3Guint) m3gFloatToByte(r) << 16)
  1563         |  ((M3Guint) m3gFloatToByte(g) <<  8)
  1564         |   (M3Guint) m3gFloatToByte(b)
  1565         |   M3G_ALPHA_MASK;
  1566 }
  1567 
  1568 static M3Guint m3gColor4f(M3Gfloat r, M3Gfloat g, M3Gfloat b, M3Gfloat a)
  1569 {
  1570     return ((M3Guint) m3gFloatToByte(r) << 16)
  1571         |  ((M3Guint) m3gFloatToByte(g) <<  8)
  1572         |   (M3Guint) m3gFloatToByte(b)
  1573         |  ((M3Guint) m3gFloatToByte(a) << 24);
  1574 }
  1575 
  1576 static void m3gFloatColor(M3Gint argb, M3Gfloat intensity, M3Gfloat *rgba)
  1577 {
  1578     /* NOTE we intentionally aim a bit high here -- some GL
  1579      * implementations may round down instead of closest */
  1580     
  1581     const M3Gfloat oneOver255 = (M3Gfloat)(1.0001 / 255.0);
  1582     
  1583 	rgba[0] = (M3Gfloat)((argb >> 16) & 0xFF);
  1584 	rgba[1] = (M3Gfloat)((argb >>  8) & 0xFF);
  1585 	rgba[2] = (M3Gfloat)((argb      ) & 0xFF);
  1586 	rgba[3] = (M3Gfloat)((argb >> 24) & 0xFF);
  1587     
  1588     m3gScale4(rgba, m3gMul(oneOver255, intensity));
  1589 }
  1590 
  1591 /*!
  1592  * \brief Converts the 3x3 submatrix of a matrix into fixed point
  1593  *
  1594  * The input matrix must be an affine one, i.e. the bottom row must be
  1595  * 0 0 0 1.  The output matrix is guaranteed to be such that it can be
  1596  * multiplied with a 16-bit 3-vector without overflowing, while using
  1597  * the 32-bit range optimally.
  1598  *
  1599  * \param mtx  the original matrix
  1600  * \param elem 9 shorts to hold the fixed point base numbers
  1601  * \return floating point exponent for the values in \c elem
  1602  *         (number of bits to shift left for actual values)
  1603  */
  1604 static M3Gint m3gGetFixedPoint3x3Basis(const Matrix *mtx, M3Gshort *elem)
  1605 {
  1606     M3Gint outExp;
  1607     M3Gint row, col;
  1608     const M3Guint *m;
  1609     
  1610     if (!mtx->complete) {
  1611         m3gFillClassifiedMatrix((Matrix*) mtx);
  1612     }
  1613     m = (const M3Guint*) mtx->elem;
  1614     
  1615     /* First, find the maximum exponent value in the whole matrix */
  1616 
  1617     outExp = 0;
  1618     for (col = 0; col < 3; ++col) {
  1619         for (row = 0; row < 3; ++row) {
  1620             M3Gint element = (M3Gint)(m[MELEM(row, col)] & ~SIGN_MASK);
  1621             outExp = M3G_MAX(outExp, element);
  1622         }
  1623     }
  1624     outExp >>= 23;
  1625 
  1626     /* Our candidate exponent is the maximum found plus 9, which is
  1627      * guaranteed to shift the maximum unsigned 24-bit mantissa (which
  1628      * always will have the high bit set) down to the signed 16-bit
  1629      * range */
  1630 
  1631     outExp += 9;
  1632     
  1633     /* Now proceed to sum each row and see what's the actual smallest
  1634      * exponent we can safely use without overflowing in a 16+16
  1635      * matrix-vector multiplication; this will win us one bit
  1636      * (doubling the precision) compared to the conservative approach
  1637      * of just shifting everything down by 10 bits */
  1638 
  1639     for (row = 0; row < 3; ++row) {
  1640 
  1641         /* Sum the absolute values on this row */
  1642             
  1643         M3Gint rowSum = 0;
  1644         for (col = 0; col < 3; ++col) {
  1645             M3Gint a = (M3Gint)(m[MELEM(row, col)] & ~SIGN_MASK);
  1646             M3Gint shift = outExp - (a >> 23);
  1647             M3G_ASSERT(shift < 265);
  1648                 
  1649             if (shift < 24) {
  1650                 rowSum += ((a & MANTISSA_MASK) | LEADING_ONE) >> shift;
  1651             }
  1652         }
  1653 
  1654         /* Now we have a 26-bit sum of the absolute values on this
  1655          * row, and shift that down until we fit the target range of
  1656          * [0, 65535].  Note that this still leaves *exactly* enough
  1657          * space for adding in an arbitrary 16-bit translation vector
  1658          * after multiplying with the matrix! */
  1659             
  1660         while (rowSum >= (1 << 16)) {
  1661             rowSum >>= 1;
  1662             ++outExp;
  1663         }
  1664     }
  1665 
  1666     /* De-bias the exponent, but add in an extra 23 to account for the
  1667      * decimal bits in the floating point mantissa values we started
  1668      * with (we're returning the exponent as "bits to shift left to
  1669      * get integers", so we're off by 23 from IEEE notation) */
  1670     
  1671     outExp = (outExp - 127) - 23;
  1672     
  1673     /* Finally, shift all the matrix elements to our final output
  1674      * precision */
  1675     
  1676     for (col = 0; col < 3; ++col) {
  1677         m3gFloatVecToShort(3, mtx->elem + MELEM(0, col), outExp, elem);
  1678         elem += 3;
  1679     }
  1680     return outExp;
  1681 }
  1682 
  1683 /*!
  1684  * \brief Gets the translation component of a matrix as fixed point
  1685  *
  1686  * \param mtx  the matrix
  1687  * \param elem 3 shorts to write the vector into
  1688  * \return floating point exponent for the values in \c elem
  1689  *         (number of bits to shift left for actual values)
  1690  */
  1691 static M3Gint m3gGetFixedPointTranslation(const Matrix *mtx, M3Gshort *elem)
  1692 {
  1693     const M3Guint *m;
  1694     
  1695     M3G_ASSERT(m3gIsWUnity(mtx));
  1696     if (!mtx->complete) {
  1697         m3gFillClassifiedMatrix((Matrix*) mtx);
  1698     }
  1699     m = (const M3Guint*) &mtx->elem[MELEM(0, 3)];
  1700 
  1701     /* Find the maximum exponent, then scale down by 9 bits from that
  1702      * to shift the unsigned 24-bit mantissas to the signed 16-bit
  1703      * range */
  1704     {
  1705         M3Gint outExp;
  1706         M3Guint maxElem = m[0] & ~SIGN_MASK;
  1707         maxElem = M3G_MAX(maxElem, m[1] & ~SIGN_MASK);
  1708         maxElem = M3G_MAX(maxElem, m[2] & ~SIGN_MASK);
  1709         
  1710         outExp = (M3Gint)(maxElem >> 23) - (127 + 23) + 9;
  1711         m3gFloatVecToShort(3, mtx->elem + MELEM(0, 3), outExp, elem);
  1712         return outExp;
  1713     }
  1714 }
  1715 
  1716 /*!
  1717  * \internal
  1718  * \brief Compute a bounding box enclosing two other boxes
  1719  *
  1720  * \param box   box to fit
  1721  * \param a     first box to enclose or NULL
  1722  * \param b     second box to enclose or NULL
  1723  * 
  1724  * \note If both input boxes are NULL, the box is not modified.
  1725  */
  1726 static void m3gFitAABB(AABB *box, const AABB *a, const AABB *b)
  1727 {
  1728     int i;
  1729 
  1730     M3G_ASSERT_PTR(box);
  1731     
  1732     if (a) {
  1733         m3gValidateAABB(a);
  1734     }
  1735     if (b) {
  1736         m3gValidateAABB(b);
  1737     }
  1738 
  1739     if (a && b) {
  1740         for (i = 0; i < 3; ++i) {
  1741             box->min[i] = M3G_MIN(a->min[i], b->min[i]);
  1742             box->max[i] = M3G_MAX(a->max[i], b->max[i]);
  1743         }
  1744     }
  1745     else if (a) {
  1746         *box = *a;
  1747     }
  1748     else if (b) {
  1749         *box = *b;
  1750     }
  1751 }
  1752 
  1753 /*
  1754  * \internal
  1755  * \brief Transform an axis-aligned bounding box with a matrix
  1756  *
  1757  * This results in a box that encloses the transformed original box.
  1758  * 
  1759  * Based on "Transforming Axis-Aligned Bounding Boxes" by Jim Arvo
  1760  * from Graphics Gems.
  1761  * 
  1762  * \note The bottom row of the matrix is ignored in the transformation.
  1763  */
  1764 static void m3gTransformAABB(AABB *box, const Matrix *mtx)
  1765 {
  1766     M3Gfloat boxMin[3], boxMax[3];
  1767     M3Gfloat newMin[3], newMax[3];
  1768 
  1769     m3gValidateAABB(box);
  1770     
  1771     if (!mtx->complete) {
  1772         m3gFillClassifiedMatrix((Matrix*) mtx);
  1773     }
  1774 
  1775     /* Get the original minimum and maximum extents as floats, and add
  1776      * the translation as the base for the transformed box */
  1777     {
  1778         int i;
  1779         for (i = 0; i < 3; ++i) {
  1780             boxMin[i] = box->min[i];
  1781             boxMax[i] = box->max[i];
  1782             newMin[i] = newMax[i] = M44F(mtx, i, 3);
  1783         }
  1784     }
  1785 
  1786     /* Transform into the new minimum and maximum coordinates using
  1787      * the upper left 3x3 part of the matrix */
  1788     {
  1789         M3Gint row, col;
  1790         
  1791         for (row = 0; row < 3; ++row) {
  1792             for (col = 0; col < 3; ++col) {
  1793                 M3Gfloat a = m3gMul(M44F(mtx, row, col), boxMin[col]);
  1794                 M3Gfloat b = m3gMul(M44F(mtx, row, col), boxMax[col]);
  1795                 
  1796                 if (a < b) { 
  1797                     newMin[row] = m3gAdd(newMin[row], a);
  1798                     newMax[row] = m3gAdd(newMax[row], b);
  1799                 }
  1800                 else { 
  1801                     newMin[row] = m3gAdd(newMin[row], b);
  1802                     newMax[row] = m3gAdd(newMax[row], a);
  1803                 }
  1804             }
  1805         }
  1806     }
  1807 
  1808     /* Write back into the bounding box */
  1809     {
  1810         int i;
  1811         for (i = 0; i < 3; ++i) {
  1812             box->min[i] = newMin[i];
  1813             box->max[i] = newMax[i];
  1814         }
  1815     }
  1816     
  1817     m3gValidateAABB(box);
  1818 }
  1819 
  1820 #if defined(M3G_DEBUG)
  1821 /*!
  1822  * \brief
  1823  */
  1824 static void m3gValidateAABB(const AABB *aabb)
  1825 {
  1826     m3gValidateFloats(6, (float*) aabb);
  1827 }
  1828 #endif
  1829 
  1830 /*----------------------------------------------------------------------
  1831  * Public functions
  1832  *--------------------------------------------------------------------*/
  1833 
  1834 /*!
  1835  * \brief Linear interpolation of vectors
  1836  *
  1837  * \param size     number of components
  1838  * \param vec      output vector
  1839  * \param s        interpolation factor
  1840  * \param start    initial value
  1841  * \param end      target value
  1842  */
  1843 #if defined(M3G_HW_FLOAT_VFPV2)
  1844 
  1845 __weak __asm void m3gLerp(M3Gint size,
  1846 				   M3Gfloat *vec,
  1847 				   M3Gfloat s,
  1848 				   const M3Gfloat *start, const M3Gfloat *end)
  1849 {
  1850 // r0 = size
  1851 // r1 = *vec
  1852 // r2 = s
  1853 // r3 = *start
  1854 // sp[0] = *end
  1855 
  1856 		EXPORT	m3gLerp[DYNAMIC]
  1857 		CODE32
  1858 /*
  1859     M3Gfloat sCompl = 1.0 - s;
  1860     for (i = 0; i < size; ++i) {
  1861         vec[i] = sCompl*start[i] + s*end[i];
  1862     }
  1863 */
  1864 //
  1865 // if size = 0, return
  1866 //
  1867 		CMP		r0, #0
  1868 		BXEQ	lr
  1869 
  1870 		FMSR	s0, r2
  1871 		MOV		r2, r3
  1872 		LDR		r3, [sp]
  1873 
  1874 		FLDS	s1,=1.0
  1875 		STMFD	sp!, {r4-r5}
  1876 		FSUBS	s2, s1, s0			// sCompl = 1 - s
  1877 
  1878 		FMRX	r4, FPSCR
  1879 		CMP		r0, #4
  1880 		BGT		_m3gLerp_over4Loop
  1881 
  1882 //
  1883 // if 1 < size <= 4
  1884 //
  1885 // set vector STRIDE = 1, LENGTH = 4
  1886 		BIC		r12, r4, #0x00370000
  1887 		ORR		r12, #(3<<16)
  1888 		FMXR	FPSCR, r12
  1889 
  1890 		FLDMIAS	r2!, {s4-s7}		// load the start[i] values
  1891 		FLDMIAS	r3!, {s8-s11}		// load the end[i] values
  1892 		FMULS	s12, s4, s2			// [s12-s15]  = [sCompl*start[0] .. sCompl*start[3]]
  1893 		FMACS	s12, s8, s0			// [s12-s15] += [    	s*end[0] ..        s*end[3]]
  1894 		CMP		r0, #1
  1895 		FSTS	s12, [r1]
  1896 		FSTSGT	s13, [r1, #4]
  1897 		CMP		r0, #3
  1898 		FSTSGE	s14, [r1, #8]
  1899 		FSTSGT	s15, [r1, #12]
  1900 
  1901 		FMXR	FPSCR, r4
  1902 
  1903 		LDMFD	sp!, {r4-r5}
  1904 
  1905 		BX		lr
  1906 //
  1907 // if size > 4, interpolate 8 values in one loop
  1908 //
  1909 _m3gLerp_over4Loop
  1910 
  1911 		FSTMFDD	sp!, {d8-d9}
  1912 		MOVS	r5, r0, ASR #3			// size/8
  1913 		SUB		r0, r0, r5, LSL #3		// tail length
  1914 
  1915 // set vector STRIDE = 1, LENGTH = 8
  1916 		BIC		r12, r4, #0x00370000
  1917 		ORR		r12, #(7<<16)
  1918 		FMXR	FPSCR, r12
  1919 
  1920 
  1921 _m3gLerp_alignedLoop
  1922 
  1923 		FLDMIASNE	r2!, {s8-s15}		// load the start[i] values
  1924 		FLDMIASNE	r3!, {s16-s23}		// load the end[i] values
  1925 		FMULSNE		s24, s8, s2			// [s16-s23]  = [ sCompl*start[0] sCompl*start[1] .. sCompl*start[7]]
  1926 		FMACSNE		s24, s16, s0		// [s16-s23] += [		 s*end[0]        s*end[1] ..		s*end[7]]
  1927 		FSTMIASNE	r1!, {s24-s31}
  1928 		SUBSNE		r5, #1
  1929 
  1930 		BNE			_m3gLerp_alignedLoop
  1931 
  1932 // process the 0-7 remaining values in the tail
  1933 
  1934 		CMP			r0, #1
  1935 		FLDMIASGE	r2!, {s8-s15}		
  1936 		FLDMIASGE	r3!, {s16-s23}		
  1937 		FMULSGE		s24, s8, s2			
  1938 		FMACSGE		s24, s16, s0		
  1939 		FSTSGE		s24, [r1]
  1940 		FSTSGT		s25, [r1, #4]
  1941 		CMP			r0, #3
  1942 		FSTSGE		s26, [r1, #8]
  1943 		FSTSGT		s27, [r1, #12]
  1944 		CMP			r0, #5
  1945 		FSTSGE		s28, [r1, #16]
  1946 		FSTSGT		s29, [r1, #20]
  1947 		CMP			r0, #7
  1948 		FSTSEQ		s30, [r1, #24]
  1949 
  1950 		FMXR	FPSCR, r4
  1951 
  1952 		FLDMFDD	sp!, {d8-d9}
  1953 		LDMFD	sp!, {r4-r5}
  1954 
  1955 		BX		lr
  1956 
  1957 }
  1958 #else /* #if defined(M3G_HW_FLOAT_VFPV2) */
  1959 
  1960 M3G_API void m3gLerp(M3Gint size,
  1961              M3Gfloat *vec,
  1962              M3Gfloat s,
  1963              const M3Gfloat *start, const M3Gfloat *end)
  1964 {
  1965     int i;
  1966 
  1967     M3Gfloat sCompl = m3gSub(1.f, s);
  1968     for (i = 0; i < size; ++i) {
  1969         vec[i] = m3gAdd(m3gMul(sCompl, start[i]), m3gMul(s, end[i]));
  1970     }
  1971 }
  1972 #endif /* #if defined(M3G_HW_FLOAT_VFPV2) */
  1973 
  1974 /*!
  1975  * \brief Hermite spline interpolation of vectors
  1976  *
  1977  * \param size      number of components
  1978  * \param vec       output vector
  1979  * \param s         interpolation factor
  1980  * \param start     start value vector
  1981  * \param end       end value vector
  1982  * \param tStart    start tangent vector
  1983  * \param tEnd      end tangent vector
  1984  */
  1985 M3G_API void m3gHermite(M3Gint size,
  1986                         M3Gfloat *vec,
  1987                         M3Gfloat s,
  1988                         const M3Gfloat *start, const M3Gfloat *end,
  1989                         const M3Gfloat *tStart, const M3Gfloat *tEnd)
  1990 {
  1991     M3Gfloat s2 = m3gSquare(s);
  1992     M3Gfloat s3 = m3gMul(s2, s);
  1993     int i;
  1994     
  1995     for (i = 0; i < size; ++i) {
  1996         vec[i] =
  1997             m3gMadd(start[i],
  1998                     m3gAdd(m3gSub(m3gDouble(s3), m3gMul(3.f, s2)), 1.f),
  1999                     m3gMadd(end[i],
  2000                             m3gSub(m3gMul(3.f, s2), m3gDouble(s3)),
  2001                             m3gMadd(tStart[i],
  2002                                     m3gAdd(m3gSub(s3, m3gDouble(s2)), s),
  2003                                     m3gMul(tEnd[i],
  2004                                            m3gSub(s3, s2)))));
  2005 
  2006     }
  2007     
  2008     /*  vec = ( 2*s3 - 3*s2 + 1) * start
  2009             + (-2*s3 + 3*s2    ) * end
  2010             + (   s3 - 2*s2 + s) * tStart
  2011             + (   s3 -   s2    ) * tEnd;    */
  2012 }
  2013 
  2014 /*--------------------------------------------------------------------*/
  2015 
  2016 /*!
  2017  * \brief Sets a matrix to a copy of another matrix
  2018  */
  2019 M3G_API void m3gCopyMatrix(Matrix *dst, const Matrix *src)
  2020 {
  2021     M3G_ASSERT(dst != NULL && src != NULL);
  2022     *dst = *src;
  2023 }
  2024 
  2025 /*!
  2026  * \brief Vector addition
  2027  */
  2028 M3G_API void m3gAddVec3(Vec3 *vec, const Vec3 *other)
  2029 {
  2030     vec->x = m3gAdd(vec->x, other->x);
  2031     vec->y = m3gAdd(vec->y, other->y);
  2032     vec->z = m3gAdd(vec->z, other->z);
  2033 }
  2034 
  2035 /*!
  2036  * \brief Vector addition
  2037  */
  2038 M3G_API void m3gAddVec4(Vec4 *vec, const Vec4 *other)
  2039 {
  2040     vec->x = m3gAdd(vec->x, other->x);
  2041     vec->y = m3gAdd(vec->y, other->y);
  2042     vec->z = m3gAdd(vec->z, other->z);
  2043     vec->w = m3gAdd(vec->w, other->w);
  2044 }
  2045 
  2046 /*!
  2047  * \brief Cross product of two 3D vectors expressed as 4D vectors
  2048  */
  2049 M3G_API void m3gCross(Vec3 *dst, const Vec3 *a, const Vec3 *b)
  2050 {
  2051     dst->x = m3gSub(m3gMul(a->y, b->z), m3gMul(a->z, b->y));
  2052     dst->y = m3gSub(m3gMul(a->z, b->x), m3gMul(a->x, b->z));
  2053     dst->z = m3gSub(m3gMul(a->x, b->y), m3gMul(a->y, b->x));
  2054 }
  2055 
  2056 /*!
  2057  * \brief Dot product of two vectors
  2058  */
  2059 M3G_API M3Gfloat m3gDot3(const Vec3 *a, const Vec3 *b)
  2060 {
  2061     M3Gfloat d;
  2062     d = m3gMul(a->x, b->x);
  2063     d = m3gMadd(a->y, b->y, d);
  2064     d = m3gMadd(a->z, b->z, d);
  2065     return d;
  2066 }
  2067 
  2068 /*!
  2069  * \brief Dot product of two vectors
  2070  */
  2071 M3G_API M3Gfloat m3gDot4(const Vec4 *a, const Vec4 *b)
  2072 {
  2073     M3Gfloat d;
  2074     d = m3gMul(a->x, b->x);
  2075     d = m3gMadd(a->y, b->y, d);
  2076     d = m3gMadd(a->z, b->z, d);
  2077     d = m3gMadd(a->w, b->w, d);
  2078     return d;
  2079 }
  2080 
  2081 /*!
  2082  * \brief
  2083  */
  2084 M3G_API void m3gSetVec3(Vec3 *v, M3Gfloat x, M3Gfloat y, M3Gfloat z)
  2085 {
  2086     M3G_ASSERT_PTR(v);
  2087     v->x = x;
  2088     v->y = y;
  2089     v->z = z;
  2090 }
  2091 
  2092 /*!
  2093  * \brief
  2094  */
  2095 M3G_API void m3gSetVec4(Vec4 *v, M3Gfloat x, M3Gfloat y, M3Gfloat z, M3Gfloat w)
  2096 {
  2097     M3G_ASSERT_PTR(v);
  2098     v->x = x;
  2099     v->y = y;
  2100     v->z = z;
  2101     v->w = w;
  2102 }
  2103 
  2104 /*!
  2105  * \brief Vector subtraction
  2106  */
  2107 M3G_API void m3gSubVec3(Vec3 *vec, const Vec3 *other)
  2108 {
  2109     vec->x = m3gSub(vec->x, other->x);
  2110     vec->y = m3gSub(vec->y, other->y);
  2111     vec->z = m3gSub(vec->z, other->z);
  2112 }
  2113 
  2114 /*!
  2115  * \brief Vector subtraction
  2116  */
  2117 M3G_API void m3gSubVec4(Vec4 *vec, const Vec4 *other)
  2118 {
  2119     vec->x = m3gSub(vec->x, other->x);
  2120     vec->y = m3gSub(vec->y, other->y);
  2121     vec->z = m3gSub(vec->z, other->z);
  2122     vec->w = m3gSub(vec->w, other->w);
  2123 }
  2124 
  2125 /*!
  2126  * \brief Vector length
  2127  */
  2128 M3G_API M3Gfloat m3gLengthVec3(const Vec3 *vec)
  2129 {
  2130     return m3gSqrt(m3gAdd(m3gAdd(m3gSquare(vec->x),
  2131                                  m3gSquare(vec->y)),
  2132                           m3gSquare(vec->z)));
  2133 }
  2134 
  2135 /*!
  2136  * \brief Vector scaling
  2137  */
  2138 M3G_API void m3gScaleVec3(Vec3 *vec, const M3Gfloat s)
  2139 {
  2140     vec->x = m3gMul(vec->x, s);
  2141     vec->y = m3gMul(vec->y, s);
  2142     vec->z = m3gMul(vec->z, s);
  2143 }
  2144 
  2145 /*!
  2146  * \brief Vector scaling
  2147  */
  2148 M3G_API void m3gScaleVec4(Vec4 *vec, const M3Gfloat s)
  2149 {
  2150     vec->x = m3gMul(vec->x, s);
  2151     vec->y = m3gMul(vec->y, s);
  2152     vec->z = m3gMul(vec->z, s);
  2153     vec->w = m3gMul(vec->w, s);
  2154 }
  2155 
  2156 /*!
  2157  * \brief Returns an angle-axis representation for a quaternion
  2158  *
  2159  * \note There are many, and this is not guaranteed to return a
  2160  * particular one
  2161  */
  2162 M3G_API void m3gGetAngleAxis(const Quat *quat, M3Gfloat *angle, Vec3 *axis)
  2163 {
  2164     M3Gfloat x, y, z, sinTheta;
  2165 
  2166     M3G_ASSERT_PTR(quat);
  2167     M3G_ASSERT_PTR(angle);
  2168     M3G_ASSERT_PTR(axis);
  2169 
  2170     x = quat->x;
  2171     y = quat->y;
  2172     z = quat->z;
  2173 
  2174     sinTheta = m3gSqrt(m3gAdd(m3gAdd(m3gSquare(x), m3gSquare(y)),
  2175                               m3gSquare(z)));
  2176 
  2177     if (sinTheta > EPSILON) {
  2178         M3Gfloat ooSinTheta = m3gRcp(sinTheta);
  2179         axis->x = m3gMul(x, ooSinTheta);
  2180         axis->y = m3gMul(y, ooSinTheta);
  2181         axis->z = m3gMul(z, ooSinTheta);
  2182     }
  2183     else {
  2184         /* return a valid axis even for no rotation */
  2185         axis->x = axis->y = 0.0f;
  2186         axis->z = 1.0f;
  2187     }
  2188     *angle = m3gMul(2.0f * RAD2DEG, m3gArcCos(quat->w));
  2189 }
  2190 
  2191 /*!
  2192  * \brief Gets a single matrix column
  2193  */
  2194 M3G_API void m3gGetMatrixColumn(const Matrix *mtx, M3Gint col, Vec4 *dst)
  2195 {
  2196     if ((col & ~3) == 0) {
  2197         if (!mtx->complete) {
  2198             m3gFillClassifiedMatrix((Matrix*)mtx);
  2199         }
  2200         dst->x = M44F(mtx, 0, col);
  2201         dst->y = M44F(mtx, 1, col);
  2202         dst->z = M44F(mtx, 2, col);
  2203         dst->w = M44F(mtx, 3, col);
  2204     }
  2205     else {
  2206         M3G_ASSERT(M3G_FALSE);
  2207     }
  2208 }
  2209 
  2210 /*!
  2211  * \brief Returns the floating point values of a matrix as consecutive
  2212  * columns
  2213  */
  2214 M3G_API void m3gGetMatrixColumns(const Matrix *mtx, M3Gfloat *dst)
  2215 {
  2216     M3G_ASSERT(mtx != NULL && dst != NULL);
  2217 
  2218     if (!mtx->complete) {
  2219         m3gFillClassifiedMatrix((Matrix*)mtx);
  2220     }
  2221     m3gCopy(dst, mtx->elem, sizeof(mtx->elem));
  2222 }
  2223 
  2224 /*!
  2225  * \brief Gets a single matrix row
  2226  */
  2227 M3G_API void m3gGetMatrixRow(const Matrix *mtx, M3Gint row, Vec4 *dst)
  2228 {
  2229     if ((row & ~3) == 0) {
  2230         if (!mtx->complete) {
  2231             m3gFillClassifiedMatrix((Matrix*)mtx);
  2232         }
  2233         dst->x = M44F(mtx, row, 0);
  2234         dst->y = M44F(mtx, row, 1);
  2235         dst->z = M44F(mtx, row, 2);
  2236         dst->w = M44F(mtx, row, 3);
  2237     }
  2238     else {
  2239         M3G_ASSERT(M3G_FALSE);
  2240     }
  2241 }
  2242 
  2243 /*!
  2244  * \brief Returns the floating point values of a matrix as consecutive
  2245  * rows
  2246  */
  2247 M3G_API void m3gGetMatrixRows(const Matrix *mtx, M3Gfloat *dst)
  2248 {
  2249     M3G_ASSERT(mtx != NULL && dst != NULL);
  2250 
  2251     if (!mtx->complete) {
  2252         m3gFillClassifiedMatrix((Matrix*)mtx);
  2253     }
  2254     {
  2255         int row;
  2256         for (row = 0; row < 4; ++row) {
  2257             *dst++ = mtx->elem[ 0 + row];
  2258             *dst++ = mtx->elem[ 4 + row];
  2259             *dst++ = mtx->elem[ 8 + row];
  2260             *dst++ = mtx->elem[12 + row];
  2261         }
  2262     }
  2263 }
  2264 
  2265 /*!
  2266  * \brief Sets a matrix to identity
  2267  */
  2268 M3G_API void m3gIdentityMatrix(Matrix *mtx)
  2269 {
  2270     M3G_ASSERT(mtx != NULL);
  2271     m3gClassifyAs(MC_IDENTITY, mtx);
  2272 }
  2273 
  2274 /*!
  2275  * \brief Sets a quaternion to identity
  2276  */
  2277 M3G_API void m3gIdentityQuat(Quat *quat)
  2278 {
  2279     M3G_ASSERT(quat != NULL);
  2280     quat->x = quat->y = quat->z = 0.0f;
  2281     quat->w = 1.0f;
  2282 }
  2283 
  2284 /*!
  2285  * \brief Inverts a matrix
  2286  */
  2287 M3G_API M3Gbool m3gInvertMatrix(Matrix *mtx)
  2288 {
  2289     M3Gfloat *matrix;
  2290     M3Gint i;
  2291     M3Gfloat tmp[12];
  2292     M3Gfloat src[16];
  2293     M3Gfloat det;
  2294 
  2295     M3G_ASSERT(mtx != NULL);
  2296 
  2297     if (!m3gIsClassified(mtx)) {
  2298         m3gClassify(mtx);
  2299     }
  2300 
  2301     /* Quick exit for identity */
  2302     
  2303     if (mtx->mask == MC_IDENTITY) {
  2304         return M3G_TRUE;
  2305     }
  2306 
  2307     /* Look for other common cases; these require that we have valid
  2308      * values in all the elements, so fill in the values first */
  2309     {
  2310         M3Guint mask = mtx->mask;
  2311         
  2312         if (!mtx->complete) {
  2313             m3gFillClassifiedMatrix(mtx);
  2314         }
  2315         
  2316         if ((mask | (0x3F << 24)) == MC_TRANSLATION) {
  2317             M44F(mtx, 0, 3) = m3gNegate(M44F(mtx, 0, 3));
  2318             M44F(mtx, 1, 3) = m3gNegate(M44F(mtx, 1, 3));
  2319             M44F(mtx, 2, 3) = m3gNegate(M44F(mtx, 2, 3));
  2320             mtx->mask = MC_TRANSLATION;
  2321             return M3G_TRUE;
  2322         }
  2323         if ((mask | 0x300C03) == MC_SCALING) {
  2324             if ((mask &  3       ) == 0 ||
  2325                 (mask & (3 << 10)) == 0 ||
  2326                 (mask & (3 << 20)) == 0) {
  2327                 return M3G_FALSE; /* zero scale for at least one axis */
  2328             }
  2329             M44F(mtx, 0, 0) = m3gRcp(M44F(mtx, 0, 0));
  2330             M44F(mtx, 1, 1) = m3gRcp(M44F(mtx, 1, 1));
  2331             M44F(mtx, 2, 2) = m3gRcp(M44F(mtx, 2, 2));
  2332             return M3G_TRUE;
  2333         }
  2334     }
  2335 
  2336     /* Do a full 4x4 inversion as a last resort */
  2337         
  2338 	matrix = mtx->elem;
  2339 
  2340     /* transpose matrix */
  2341     for (i = 0; i < 4; i++) {
  2342         src[i] = matrix[i*4];
  2343         src[i+4] = matrix[i*4+1];
  2344         src[i+8] = matrix[i*4+2];
  2345         src[i+12] = matrix[i*4+3];
  2346     }
  2347 
  2348     /* calculate pairs for first 8 elements (cofactors) */
  2349     tmp[0] = src[10]*src[15];
  2350     tmp[1] = src[11]*src[14];
  2351     tmp[2] = src[9]*src[15];
  2352     tmp[3] = src[11]*src[13];
  2353     tmp[4] = src[9]*src[14];
  2354     tmp[5] = src[10]*src[13];
  2355     tmp[6] = src[8]*src[15];
  2356     tmp[7] = src[11]*src[12];
  2357     tmp[8] = src[8]*src[14];
  2358     tmp[9] = src[10]*src[12];
  2359     tmp[10] = src[8]*src[13];
  2360     tmp[11] = src[9]*src[12];
  2361 
  2362     /* calculate first 8 elements (cofactors) */
  2363     matrix[0] = tmp[0]*src[5] + tmp[3]*src[6] + tmp[4]*src[7];
  2364     matrix[0] -= tmp[1]*src[5] + tmp[2]*src[6] + tmp[5]*src[7];
  2365     matrix[1] = tmp[1]*src[4] + tmp[6]*src[6] + tmp[9]*src[7];
  2366     matrix[1] -= tmp[0]*src[4] + tmp[7]*src[6] + tmp[8]*src[7];
  2367     matrix[2] = tmp[2]*src[4] + tmp[7]*src[5] + tmp[10]*src[7];
  2368     matrix[2] -= tmp[3]*src[4] + tmp[6]*src[5] + tmp[11]*src[7];
  2369     matrix[3] = tmp[5]*src[4] + tmp[8]*src[5] + tmp[11]*src[6];
  2370     matrix[3] -= tmp[4]*src[4] + tmp[9]*src[5] + tmp[10]*src[6];
  2371     matrix[4] = tmp[1]*src[1] + tmp[2]*src[2] + tmp[5]*src[3];
  2372     matrix[4] -= tmp[0]*src[1] + tmp[3]*src[2] + tmp[4]*src[3];
  2373     matrix[5] = tmp[0]*src[0] + tmp[7]*src[2] + tmp[8]*src[3];
  2374     matrix[5] -= tmp[1]*src[0] + tmp[6]*src[2] + tmp[9]*src[3];
  2375     matrix[6] = tmp[3]*src[0] + tmp[6]*src[1] + tmp[11]*src[3];
  2376     matrix[6] -= tmp[2]*src[0] + tmp[7]*src[1] + tmp[10]*src[3];
  2377     matrix[7] = tmp[4]*src[0] + tmp[9]*src[1] + tmp[10]*src[2];
  2378     matrix[7] -= tmp[5]*src[0] + tmp[8]*src[1] + tmp[11]*src[2];
  2379 
  2380     /* calculate pairs for second 8 elements (cofactors) */
  2381     tmp[0] = src[2]*src[7];
  2382     tmp[1] = src[3]*src[6];
  2383     tmp[2] = src[1]*src[7];
  2384     tmp[3] = src[3]*src[5];
  2385     tmp[4] = src[1]*src[6];
  2386     tmp[5] = src[2]*src[5];
  2387     tmp[6] = src[0]*src[7];
  2388     tmp[7] = src[3]*src[4];
  2389     tmp[8] = src[0]*src[6];
  2390     tmp[9] = src[2]*src[4];
  2391     tmp[10] = src[0]*src[5];
  2392     tmp[11] = src[1]*src[4];
  2393 
  2394     /* calculate second 8 elements (cofactors) */
  2395     matrix[8] = tmp[0]*src[13] + tmp[3]*src[14] + tmp[4]*src[15];
  2396     matrix[8] -= tmp[1]*src[13] + tmp[2]*src[14] + tmp[5]*src[15];
  2397     matrix[9] = tmp[1]*src[12] + tmp[6]*src[14] + tmp[9]*src[15];
  2398     matrix[9] -= tmp[0]*src[12] + tmp[7]*src[14] + tmp[8]*src[15];
  2399     matrix[10] = tmp[2]*src[12] + tmp[7]*src[13] + tmp[10]*src[15];
  2400     matrix[10] -= tmp[3]*src[12] + tmp[6]*src[13] + tmp[11]*src[15];
  2401     matrix[11] = tmp[5]*src[12] + tmp[8]*src[13] + tmp[11]*src[14];
  2402     matrix[11] -= tmp[4]*src[12] + tmp[9]*src[13] + tmp[10]*src[14];
  2403     matrix[12] = tmp[2]*src[10] + tmp[5]*src[11] + tmp[1]*src[9];
  2404     matrix[12] -= tmp[4]*src[11] + tmp[0]*src[9] + tmp[3]*src[10];
  2405     matrix[13] = tmp[8]*src[11] + tmp[0]*src[8] + tmp[7]*src[10];
  2406     matrix[13] -= tmp[6]*src[10] + tmp[9]*src[11] + tmp[1]*src[8];
  2407     matrix[14] = tmp[6]*src[9] + tmp[11]*src[11] + tmp[3]*src[8];
  2408     matrix[14] -= tmp[10]*src[11] + tmp[2]*src[8] + tmp[7]*src[9];
  2409     matrix[15] = tmp[10]*src[10] + tmp[4]*src[8] + tmp[9]*src[9];
  2410     matrix[15] -= tmp[8]*src[9] + tmp[11]*src[10] + tmp[5]*src[8];
  2411 
  2412     /* calculate determinant */
  2413     det = src[0]*matrix[0]+src[1]*matrix[1]+src[2]*matrix[2]+src[3]*matrix[3];
  2414 
  2415     /* matrix has no inverse */
  2416     if (det == 0.0f) {
  2417         return M3G_FALSE;
  2418     }
  2419 
  2420     /* calculate matrix inverse */
  2421     det = 1/det;
  2422     for (i = 0; i < 16; i++) {
  2423         matrix[i] *= det;
  2424     }
  2425 
  2426     mtx->classified = M3G_FALSE;
  2427 	return M3G_TRUE;
  2428 }
  2429 
  2430 /*!
  2431  * \brief Sets a matrix to the inverse of another matrix
  2432  */
  2433 M3G_API M3Gbool m3gMatrixInverse(Matrix *mtx, const Matrix *other)
  2434 {
  2435     M3G_ASSERT(mtx != NULL && other != NULL);
  2436 
  2437     if (!m3gIsClassified(other)) {
  2438         m3gClassify((Matrix*)other);
  2439     }
  2440 
  2441 	m3gCopyMatrix(mtx, other);
  2442 	return m3gInvertMatrix(mtx);
  2443 }
  2444 
  2445 /*!
  2446  * \brief Sets a matrix to the transpose of another matrix
  2447  */
  2448 M3G_API void m3gMatrixTranspose(Matrix *mtx, const Matrix *other)
  2449 {
  2450     M3Gbyte i;
  2451     M3G_ASSERT(mtx != NULL && other != NULL);
  2452 
  2453     if (!other->complete) {
  2454         m3gFillClassifiedMatrix((Matrix *)other);
  2455     }
  2456 
  2457     for (i = 0; i < 4; i++) {
  2458         mtx->elem[i] = other->elem[i*4];
  2459         mtx->elem[i+4] = other->elem[i*4+1];
  2460         mtx->elem[i+8] = other->elem[i*4+2];
  2461         mtx->elem[i+12] = other->elem[i*4+3];
  2462     }
  2463     mtx->classified = M3G_FALSE;
  2464     mtx->complete = M3G_TRUE;
  2465 }
  2466 
  2467 M3G_API M3Gbool m3gInverseTranspose(Matrix *mtx, const Matrix *other)
  2468 {
  2469     Matrix temp;
  2470     if (!m3gMatrixInverse(&temp, other)) {
  2471         return M3G_FALSE;
  2472     }
  2473     m3gMatrixTranspose(mtx, &temp);
  2474     return M3G_TRUE;
  2475 }
  2476 
  2477 /*!
  2478  * \brief Sets a matrix to the product of two other matrices
  2479  *
  2480  * \note \c dst can not be either of \c left or \c right; if it is,
  2481  * the results are undefined
  2482  */
  2483 M3G_API void m3gMatrixProduct(Matrix *dst, const Matrix *left, const Matrix *right)
  2484 {
  2485     M3G_ASSERT_PTR(dst);
  2486     M3G_ASSERT_PTR(left);
  2487     M3G_ASSERT_PTR(right);
  2488     M3G_ASSERT(dst != left && dst != right);
  2489 
  2490     /* Classify input matrices and take early exits for identities */
  2491 
  2492     if (!m3gIsClassified(left)) {
  2493         m3gClassify((Matrix*)left);
  2494     }
  2495     if (left->mask == MC_IDENTITY) {
  2496         m3gCopyMatrix(dst, right);
  2497         return;
  2498     }
  2499 
  2500     if (!m3gIsClassified(right)) {
  2501         m3gClassify((Matrix*)right);
  2502     }
  2503     if (right->mask == MC_IDENTITY) {
  2504         m3gCopyMatrix(dst, left);
  2505         return;
  2506     }
  2507 
  2508     /* Special quick paths for 3x4 matrices */
  2509     
  2510     if (m3gIsWUnity(left) && m3gIsWUnity(right)) {
  2511 
  2512         /* Translation? */
  2513         
  2514         if ((left->mask & ~MC_TRANSLATION_PART) == MC_IDENTITY) {
  2515             
  2516             if (left->mask != MC_TRANSLATION && !left->complete) {
  2517                 m3gFillClassifiedMatrix((Matrix*)left);
  2518             }
  2519             if (right->mask != MC_TRANSLATION && !right->complete) {
  2520                 m3gFillClassifiedMatrix((Matrix*)right);
  2521             }
  2522 
  2523             m3gCopyMatrix(dst, right);
  2524             
  2525             M44F(dst, 0, 3) = m3gAdd(M44F(left, 0, 3), M44F(dst, 0, 3));
  2526             M44F(dst, 1, 3) = m3gAdd(M44F(left, 1, 3), M44F(dst, 1, 3));
  2527             M44F(dst, 2, 3) = m3gAdd(M44F(left, 2, 3), M44F(dst, 2, 3));
  2528             
  2529             dst->mask |= MC_TRANSLATION_PART;
  2530             return;
  2531         }
  2532 
  2533         if ((right->mask & ~MC_TRANSLATION_PART) == MC_IDENTITY) {
  2534             Vec4 tvec;
  2535 
  2536             if (left->mask != MC_TRANSLATION && !left->complete) {
  2537                 m3gFillClassifiedMatrix((Matrix*)left);
  2538             }
  2539             if (right->mask != MC_TRANSLATION && !right->complete) {
  2540                 m3gFillClassifiedMatrix((Matrix*)right);
  2541             }
  2542 
  2543             m3gCopyMatrix(dst, left);
  2544             
  2545             m3gGetMatrixColumn(right, 3, &tvec);
  2546             m3gTransformVec4(dst, &tvec);
  2547             
  2548             M44F(dst, 0, 3) = tvec.x;
  2549             M44F(dst, 1, 3) = tvec.y;
  2550             M44F(dst, 2, 3) = tvec.z;
  2551             
  2552             dst->mask |= MC_TRANSLATION_PART;
  2553             return;
  2554         }
  2555 
  2556     }
  2557         
  2558     /* Compute product and set output classification */
  2559 
  2560     m3gGenericMatrixProduct(dst, left, right);
  2561 }
  2562 
  2563 /*!
  2564  * \brief Normalizes a quaternion
  2565  */
  2566 M3G_API void m3gNormalizeQuat(Quat *q)
  2567 {
  2568     M3Gfloat norm;
  2569     M3G_ASSERT_PTR(q);
  2570     
  2571     norm = m3gNorm4(&q->x);
  2572     
  2573     if (norm > EPSILON) {
  2574         norm = m3gRcpSqrt(norm);
  2575         m3gScale4(&q->x, norm);
  2576     }
  2577     else {
  2578         m3gIdentityQuat(q);
  2579     }
  2580 }
  2581 
  2582 /*!
  2583  * \brief Normalizes a three-vector
  2584  */
  2585 M3G_API void m3gNormalizeVec3(Vec3 *v)
  2586 {
  2587     M3Gfloat norm;
  2588     M3G_ASSERT_PTR(v);
  2589     
  2590     norm = m3gNorm3(&v->x);
  2591     
  2592     if (norm > EPSILON) {
  2593         norm = m3gRcpSqrt(norm);
  2594         m3gScale3(&v->x, norm);
  2595     }
  2596     else {
  2597         m3gZero(v, sizeof(Vec3));
  2598     }
  2599 }
  2600 
  2601 /*!
  2602  * \brief Normalizes a four-vector
  2603  */
  2604 M3G_API void m3gNormalizeVec4(Vec4 *v)
  2605 {
  2606     M3Gfloat norm;
  2607     M3G_ASSERT_PTR(v);
  2608     
  2609     norm = m3gNorm4(&v->x);
  2610     
  2611     if (norm > EPSILON) {
  2612         norm = m3gRcpSqrt(norm);
  2613         m3gScale4(&v->x, norm);
  2614     }
  2615     else {
  2616         m3gZero(v, sizeof(Vec4));
  2617     }
  2618 }
  2619 
  2620 /*!
  2621  * \brief Multiplies a matrix from the right with another matrix
  2622  */
  2623 M3G_API void m3gPostMultiplyMatrix(Matrix *mtx, const Matrix *other)
  2624 {
  2625     Matrix temp;
  2626     M3G_ASSERT_PTR(mtx);
  2627     M3G_ASSERT_PTR(other);
  2628 
  2629     m3gCopyMatrix(&temp, mtx);
  2630     m3gMatrixProduct(mtx, &temp, other);
  2631 }
  2632 
  2633 /*!
  2634  * \brief Multiplies a matrix from the left with another matrix
  2635  */
  2636 M3G_API void m3gPreMultiplyMatrix(Matrix *mtx, const Matrix *other)
  2637 {
  2638     Matrix temp;
  2639     M3G_ASSERT_PTR(mtx);
  2640     M3G_ASSERT_PTR(other);
  2641 
  2642     m3gCopyMatrix(&temp, mtx);
  2643     m3gMatrixProduct(mtx, other, &temp);
  2644 }
  2645 
  2646 /*!
  2647  * \brief Multiplies a matrix with a rotation matrix.
  2648  */
  2649 M3G_API void m3gPostRotateMatrix(Matrix *mtx,
  2650                          M3Gfloat angle,
  2651                          M3Gfloat ax, M3Gfloat ay, M3Gfloat az)
  2652 {
  2653     Quat q;
  2654     m3gSetAngleAxis(&q, angle, ax, ay, az);
  2655     m3gPostRotateMatrixQuat(mtx, &q);
  2656 }
  2657 
  2658 /*!
  2659  * \brief Multiplies a matrix with a translation matrix.
  2660  */
  2661 M3G_API void m3gPostRotateMatrixQuat(Matrix *mtx, const Quat *quat)
  2662 {
  2663     Matrix temp;
  2664     m3gQuatMatrix(&temp, quat);
  2665     m3gPostMultiplyMatrix(mtx, &temp);
  2666 }
  2667 
  2668 /*!
  2669  * \brief Multiplies a matrix with a scale matrix.
  2670  */
  2671 M3G_API void m3gPostScaleMatrix(Matrix *mtx, M3Gfloat sx, M3Gfloat sy, M3Gfloat sz)
  2672 {
  2673     Matrix temp;
  2674     m3gScalingMatrix(&temp, sx, sy, sz);
  2675     m3gPostMultiplyMatrix(mtx, &temp);
  2676 }
  2677 
  2678 /*!
  2679  * \brief Multiplies a matrix with a translation (matrix).
  2680  */
  2681 M3G_API void m3gPostTranslateMatrix(Matrix *mtx,
  2682                             M3Gfloat tx, M3Gfloat ty, M3Gfloat tz)
  2683 {
  2684     Matrix temp;
  2685     m3gTranslationMatrix(&temp, tx, ty, tz);
  2686     m3gPostMultiplyMatrix(mtx, &temp);
  2687 }
  2688 
  2689 /*!
  2690  * \brief Multiplies a matrix with a rotation matrix
  2691  */
  2692 M3G_API void m3gPreRotateMatrix(Matrix *mtx,
  2693                         M3Gfloat angle,
  2694                         M3Gfloat ax, M3Gfloat ay, M3Gfloat az)
  2695 {
  2696     Quat q;
  2697     m3gSetAngleAxis(&q, angle, ax, ay, az);
  2698     m3gPreRotateMatrixQuat(mtx, &q);
  2699 }
  2700 
  2701 /*!
  2702  * \brief Multiplies a matrix with a quaternion rotation
  2703  */
  2704 M3G_API void m3gPreRotateMatrixQuat(Matrix *mtx, const Quat *quat)
  2705 {
  2706     Matrix temp;
  2707     m3gQuatMatrix(&temp, quat);
  2708     m3gPreMultiplyMatrix(mtx, &temp);
  2709 }
  2710 
  2711 /*!
  2712  * \brief Multiplies a matrix with a scale matrix.
  2713  */
  2714 M3G_API void m3gPreScaleMatrix(Matrix *mtx, M3Gfloat sx, M3Gfloat sy, M3Gfloat sz)
  2715 {
  2716     Matrix temp;
  2717     m3gScalingMatrix(&temp, sx, sy, sz);
  2718     m3gPreMultiplyMatrix(mtx, &temp);
  2719 }
  2720 
  2721 /*!
  2722  * \brief Multiplies a matrix with a translation (matrix).
  2723  */
  2724 M3G_API void m3gPreTranslateMatrix(Matrix *mtx,
  2725                            M3Gfloat tx, M3Gfloat ty, M3Gfloat tz)
  2726 {
  2727     Matrix temp;
  2728     m3gTranslationMatrix(&temp, tx, ty, tz);
  2729     m3gPreMultiplyMatrix(mtx, &temp);
  2730 }
  2731 
  2732 /*!
  2733  * \brief Converts a quaternion into a matrix
  2734  *
  2735  * The output is a matrix effecting the same rotation as the
  2736  * quaternion passed as input
  2737  */
  2738 M3G_API void m3gQuatMatrix(Matrix *mtx, const Quat *quat)
  2739 {
  2740     M3Gfloat qx = quat->x;
  2741     M3Gfloat qy = quat->y;
  2742     M3Gfloat qz = quat->z;
  2743     M3Gfloat qw = quat->w;
  2744 
  2745     /* Quick exit for identity rotations */
  2746 
  2747     if (IS_ZERO(qx) && IS_ZERO(qy) && IS_ZERO(qz)) {
  2748         m3gIdentityMatrix(mtx);
  2749         return;
  2750     }
  2751 
  2752     {
  2753         /* Determine the rough type of the output matrix */
  2754 
  2755         M3Guint type = MC_SCALING_ROTATION;
  2756         if (IS_ZERO(qz) && IS_ZERO(qy)) {
  2757             type = MC_X_ROTATION;
  2758         }
  2759         else if (IS_ZERO(qz) && IS_ZERO(qx)) {
  2760             type = MC_Y_ROTATION;
  2761         }
  2762         else if (IS_ZERO(qx) && IS_ZERO(qy)) {
  2763             type = MC_Z_ROTATION;
  2764         }
  2765         m3gClassifyAs(type, mtx);
  2766 
  2767         /* Generate the non-constant parts of the matrix */
  2768         {
  2769             M3Gfloat wx, wy, wz, xx, yy, yz, xy, xz, zz;
  2770 
  2771             xx = m3gMul(qx, qx);
  2772             xy = m3gMul(qx, qy);
  2773             xz = m3gMul(qx, qz);
  2774             yy = m3gMul(qy, qy);
  2775             yz = m3gMul(qy, qz);
  2776             zz = m3gMul(qz, qz);
  2777             wx = m3gMul(qw, qx);
  2778             wy = m3gMul(qw, qy);
  2779             wz = m3gMul(qw, qz);
  2780 
  2781             if (type != MC_X_ROTATION) {
  2782                 M44F(mtx, 0, 0) = m3gSub(1.f, m3gDouble(m3gAdd(yy, zz)));
  2783                 M44F(mtx, 0, 1) =             m3gDouble(m3gSub(xy, wz));
  2784                 M44F(mtx, 0, 2) =             m3gDouble(m3gAdd(xz, wy));
  2785             }
  2786 
  2787             if (type != MC_Y_ROTATION) {
  2788                 M44F(mtx, 1, 0) =             m3gDouble(m3gAdd(xy, wz));
  2789                 M44F(mtx, 1, 1) = m3gSub(1.f, m3gDouble(m3gAdd(xx, zz)));
  2790                 M44F(mtx, 1, 2) =             m3gDouble(m3gSub(yz, wx));
  2791             }
  2792 
  2793             if (type != MC_Z_ROTATION) {
  2794                 M44F(mtx, 2, 0) =             m3gDouble(m3gSub(xz, wy));
  2795                 M44F(mtx, 2, 1) =             m3gDouble(m3gAdd(yz, wx));
  2796                 M44F(mtx, 2, 2) = m3gSub(1.f, m3gDouble(m3gAdd(xx, yy)));
  2797             }
  2798         }
  2799         m3gSubClassify(mtx);
  2800     }
  2801 }
  2802 
  2803 /*!
  2804  * \brief Generates a scaling matrix
  2805  */
  2806 M3G_API void m3gScalingMatrix(
  2807     Matrix *mtx,
  2808     const M3Gfloat sx, const M3Gfloat sy, const M3Gfloat sz)
  2809 {
  2810     M3G_ASSERT_PTR(mtx);
  2811     M44F(mtx, 0, 0) = sx;
  2812     M44F(mtx, 1, 1) = sy;
  2813     M44F(mtx, 2, 2) = sz;
  2814     m3gClassifyAs(MC_SCALING, mtx);
  2815     m3gSubClassify(mtx);
  2816 }
  2817 
  2818 /*!
  2819  * \brief Sets a quaternion to represent an angle-axis rotation
  2820  */
  2821 M3G_API void m3gSetAngleAxis(Quat *quat,
  2822                      M3Gfloat angle,
  2823                      M3Gfloat ax, M3Gfloat ay, M3Gfloat az)
  2824 {
  2825     m3gSetAngleAxisRad(quat, m3gMul(angle, M3G_DEG2RAD), ax, ay, az);
  2826 }
  2827 
  2828 /*!
  2829  * \brief Sets a quaternion to represent an angle-axis rotation
  2830  */
  2831 M3G_API void m3gSetAngleAxisRad(Quat *quat,
  2832                         M3Gfloat angleRad,
  2833                         M3Gfloat ax, M3Gfloat ay, M3Gfloat az)
  2834 {
  2835     M3G_ASSERT_PTR(quat);
  2836     
  2837     if (!IS_ZERO(angleRad)) {
  2838         M3Gfloat s;
  2839         M3Gfloat halfAngle = m3gHalf(angleRad);
  2840 
  2841         s = m3gSin(halfAngle);
  2842         
  2843         {
  2844             M3Gfloat sqrNorm = m3gMadd(ax, ax, m3gMadd(ay, ay, m3gMul(az, az)));
  2845             if (sqrNorm < 0.995f || sqrNorm > 1.005f) {
  2846                 if (sqrNorm > EPSILON) {
  2847                     M3Gfloat ooNorm = m3gRcpSqrt(sqrNorm);
  2848                     ax = m3gMul(ax, ooNorm);
  2849                     ay = m3gMul(ay, ooNorm);
  2850                     az = m3gMul(az, ooNorm);
  2851                 }
  2852                 else {
  2853                     ax = ay = az = 0.0f;
  2854                 }
  2855             }
  2856         }
  2857 
  2858         quat->x = m3gMul(s, ax);
  2859         quat->y = m3gMul(s, ay);
  2860         quat->z = m3gMul(s, az);
  2861         quat->w = m3gCos(halfAngle);
  2862     }
  2863     else {
  2864         m3gIdentityQuat(quat);
  2865     }
  2866 }
  2867 
  2868 /*!
  2869  * \brief Quaternion multiplication.
  2870  */
  2871 M3G_API void m3gMulQuat(Quat *quat, const Quat *other)
  2872 {
  2873     Quat q;
  2874     q = *quat;
  2875 
  2876     quat->w = m3gMul(q.w, other->w) - m3gMul(q.x, other->x) - m3gMul(q.y, other->y) - m3gMul(q.z, other->z);
  2877     quat->x = m3gMul(q.w, other->x) + m3gMul(q.x, other->w) + m3gMul(q.y, other->z) - m3gMul(q.z, other->y);
  2878     quat->y = m3gMul(q.w, other->y) - m3gMul(q.x, other->z) + m3gMul(q.y, other->w) + m3gMul(q.z, other->x);
  2879     quat->z = m3gMul(q.w, other->z) + m3gMul(q.x, other->y) - m3gMul(q.y, other->x) + m3gMul(q.z, other->w);
  2880 }
  2881 
  2882 /*!
  2883  * \brief Makes this quaternion represent the rotation from one 3D
  2884  * vector to another
  2885  */
  2886 M3G_API void m3gSetQuatRotation(Quat *quat,
  2887                                 const Vec3 *from, const Vec3 *to)
  2888 {
  2889     M3Gfloat cosAngle;
  2890 
  2891     M3G_ASSERT_PTR(quat);
  2892     M3G_ASSERT_PTR(from);
  2893     M3G_ASSERT_PTR(to);
  2894 
  2895     cosAngle = m3gDot3(from, to);
  2896 
  2897     if (cosAngle > (1.0f - EPSILON)) {  /* zero angle */
  2898         m3gIdentityQuat(quat);
  2899         return;
  2900     }
  2901     else if (cosAngle > (1.0e-3f - 1.0f)) { /* normal case */
  2902         Vec3 axis;
  2903         m3gCross(&axis, from, to);
  2904         m3gSetAngleAxisRad(quat, m3gArcCos(cosAngle), axis.x, axis.y, axis.z);
  2905     }
  2906     else {
  2907 
  2908         /* Opposite vectors; must generate an arbitrary perpendicular
  2909          * vector and use that as the rotation axis. Here, we try the
  2910          * Z axis first, and if that seems too parallel to the
  2911          * vectors, project the Y axis instead: Z is the only good
  2912          * choice for Z-constrained rotations, and Y by definition
  2913          * must be perpendicular to that. */
  2914 
  2915         Vec3 axis, temp;
  2916         M3Gfloat s;
  2917 
  2918         axis.x = axis.y = axis.z = 0.0f;
  2919         if (m3gAbs(from->z) < (1.0f - EPSILON)) {
  2920             axis.z = 1.0f;
  2921         }
  2922         else {
  2923             axis.y = 1.0f;
  2924         }
  2925 
  2926         s = m3gDot3(&axis, from);
  2927         temp = *from;
  2928         m3gScaleVec3(&temp, s);
  2929         m3gSubVec3(&axis, &temp);
  2930 
  2931         m3gSetAngleAxis(quat, 180.f, axis.x, axis.y, axis.z);
  2932     }
  2933 }
  2934 
  2935 /*!
  2936  * \brief Sets the values of a matrix
  2937  *
  2938  */
  2939 M3G_API void m3gSetMatrixColumns(Matrix *mtx, const M3Gfloat *src)
  2940 {
  2941     M3G_ASSERT(mtx != NULL && src != NULL);
  2942 
  2943     m3gCopy(mtx->elem, src, sizeof(mtx->elem));
  2944     mtx->classified = M3G_FALSE;
  2945     mtx->complete = M3G_TRUE;
  2946 }
  2947 
  2948 /*!
  2949  * \brief Sets the values of a matrix
  2950  *
  2951  */
  2952 M3G_API void m3gSetMatrixRows(Matrix *mtx, const M3Gfloat *src)
  2953 {
  2954     M3G_ASSERT(mtx != NULL && src != NULL);
  2955     {
  2956         int row;
  2957         for (row = 0; row < 4; ++row) {
  2958             mtx->elem[ 0 + row] = *src++;
  2959             mtx->elem[ 4 + row] = *src++;
  2960             mtx->elem[ 8 + row] = *src++;
  2961             mtx->elem[12 + row] = *src++;
  2962         }
  2963     }
  2964     mtx->classified = M3G_FALSE;
  2965     mtx->complete = M3G_TRUE;
  2966 }
  2967 
  2968 /*!
  2969  * \brief Transforms a 4-vector with a matrix
  2970  */
  2971 #if defined(M3G_HW_FLOAT_VFPV2)
  2972 
  2973 __asm void _m3gTransformVec4(const Matrix *mtx, Vec4 *vec, M3Gint n)
  2974 {
  2975 
  2976 		CODE32
  2977 
  2978 		FSTMFDD	sp!, {d8-d11}
  2979 
  2980 		FMRX	r3, FPSCR		 
  2981 		BIC		r12, r3, #0x00370000
  2982 		ORR		r12, #(3<<16)
  2983 		FMXR	FPSCR, r12
  2984 		CMP		r2, #4
  2985 
  2986 		FLDMIAS	r0, {s4-s19}		// [mtx0  mtx1 ..]
  2987 		FLDMIAS	r1, {s0-s3}			// [vec.x  vec.y  vec.z  vec.w]
  2988 		FMULS	s20, s4, s0			// [s20-s23]  = [v.x*mtx0  v.x*mtx1  v.x*mtx2  v.x*mtx3 ]
  2989 		FMACS	s20, s8, s1			// [s20-s23] += [v.y*mtx4  v.y*mtx5  v.y*mtx6  v.y*mtx7 ]
  2990 		FMACS	s20, s12, s2		// [s20-s23] += [v.z*mtx8  v.z*mtx9  v.z*mtx10 v.z*mtx11]
  2991 		FMACS	s20, s16, s3		// [s20-s23] += [v.w*mtx12 v.w*mtx13 v.w*mtx14 v.w*mtx15]
  2992 		FSTMIAS		r1!, {s20-s22}
  2993 		FSTMIASEQ	r1, {s23}
  2994 
  2995 		FMXR	FPSCR, r3
  2996 		FLDMFDD	sp!, {d8-d11}
  2997 
  2998 		BX		lr
  2999 }
  3000 #endif /* #if defined(M3G_HW_FLOAT_VFPV2) */
  3001 
  3002 M3G_API void m3gTransformVec4(const Matrix *mtx, Vec4 *vec)
  3003 {
  3004     M3Guint type;
  3005     M3G_ASSERT(mtx != NULL && vec != NULL);
  3006 
  3007     if (!m3gIsClassified(mtx)) {
  3008         m3gClassify((Matrix*)mtx);
  3009     }
  3010 
  3011     type = mtx->mask;
  3012 
  3013     if (type == MC_IDENTITY) {
  3014         return;
  3015     }
  3016     else {
  3017         int n = m3gIsWUnity(mtx) ? 3 : 4;
  3018 
  3019         if (!mtx->complete) {
  3020             m3gFillClassifiedMatrix((Matrix*)mtx);
  3021         }
  3022 #if	defined(M3G_HW_FLOAT_VFPV2)
  3023 		_m3gTransformVec4(mtx, vec, n);
  3024 #else
  3025         {
  3026             Vec4 v = *vec;
  3027             int i;
  3028 
  3029             for (i = 0; i < n; ++i) {
  3030                 M3Gfloat d = m3gMul(M44F(mtx, i, 0), v.x);
  3031                 d = m3gMadd(M44F(mtx, i, 1), v.y, d);
  3032                 d = m3gMadd(M44F(mtx, i, 2), v.z, d);
  3033                 d = m3gMadd(M44F(mtx, i, 3), v.w, d);
  3034                 (&vec->x)[i] = d;
  3035             }
  3036         }
  3037 #endif
  3038     }
  3039 }
  3040 
  3041 /*!
  3042  * \brief Generates a translation matrix
  3043  */
  3044 M3G_API void m3gTranslationMatrix(
  3045     Matrix *mtx,
  3046     const M3Gfloat tx, const M3Gfloat ty, const M3Gfloat tz)
  3047 {
  3048     M3G_ASSERT_PTR(mtx);
  3049     M44F(mtx, 0, 3) = tx;
  3050     M44F(mtx, 1, 3) = ty;
  3051     M44F(mtx, 2, 3) = tz;
  3052     m3gClassifyAs(MC_TRANSLATION, mtx);
  3053     m3gSubClassify(mtx);
  3054 }
  3055 
  3056 /*!
  3057  * \brief Float vector assignment.
  3058  */
  3059 M3G_API void m3gSetQuat(Quat *quat, const M3Gfloat *vec)
  3060 {
  3061     quat->x = vec[0];
  3062     quat->y = vec[1];
  3063     quat->z = vec[2];
  3064     quat->w = vec[3];
  3065 }
  3066 
  3067 /*!
  3068  * \brief Slerp between quaternions q0 and q1, storing the result in quat.
  3069  */
  3070 M3G_API void m3gSlerpQuat(Quat *quat,
  3071                   M3Gfloat s,
  3072                   const Quat *q0, const Quat *q1)
  3073 {
  3074     M3Gfloat s0, s1;
  3075     M3Gfloat cosTheta = m3gDot4((const Vec4 *)q0, (const Vec4 *)q1);
  3076     M3Gfloat oneMinusS = m3gSub(1.0f, s);
  3077 
  3078     if (cosTheta > EPSILON - 1.0f) {
  3079         if (cosTheta < 1.0f - EPSILON) {
  3080             M3Gfloat theta    = m3gArcCos(cosTheta);
  3081             M3Gfloat sinTheta = m3gSin(theta);
  3082             s0 = m3gSin(m3gMul(oneMinusS, theta)) / sinTheta;
  3083             s1 = m3gSin(m3gMul(s, theta)) / sinTheta;
  3084         }
  3085         else {
  3086             /* For quaternions very close to each other, use plain
  3087                linear interpolation for numerical stability. */
  3088             s0 = oneMinusS;
  3089             s1 = s;
  3090         }
  3091         quat->x = m3gMadd(s0, q0->x, m3gMul(s1, q1->x));
  3092         quat->y = m3gMadd(s0, q0->y, m3gMul(s1, q1->y));
  3093         quat->z = m3gMadd(s0, q0->z, m3gMul(s1, q1->z));
  3094         quat->w = m3gMadd(s0, q0->w, m3gMul(s1, q1->w));
  3095     }
  3096     else {
  3097         /* Slerp is undefined if the two quaternions are (nearly)
  3098            opposite, so we just pick an arbitrary plane of
  3099            rotation here. */
  3100 
  3101         quat->x = -q0->y;
  3102         quat->y =  q0->x;
  3103         quat->z = -q0->w;
  3104         quat->w =  q0->z;
  3105 
  3106         s0 = m3gSin(m3gMul(oneMinusS, HALF_PI));
  3107         s1 = m3gSin(m3gMul(s, HALF_PI));
  3108 
  3109         quat->x = m3gMadd(s0, q0->x, m3gMul(s1, quat->x));
  3110         quat->y = m3gMadd(s0, q0->y, m3gMul(s1, quat->y));
  3111         quat->z = m3gMadd(s0, q0->z, m3gMul(s1, quat->z));
  3112     }
  3113 }
  3114 
  3115 /*!
  3116  * \brief Interpolate quaternions using spline, or "squad" interpolation.
  3117  */
  3118 M3G_API void m3gSquadQuat(Quat *quat,
  3119                   M3Gfloat s,
  3120                   const Quat *q0, const Quat *a, const Quat *b, const Quat *q1)
  3121 {
  3122 
  3123     Quat temp0;
  3124     Quat temp1;
  3125     m3gSlerpQuat(&temp0, s, q0, q1);
  3126     m3gSlerpQuat(&temp1, s, a, b);
  3127 
  3128     m3gSlerpQuat(quat, m3gDouble(m3gMul(s, m3gSub(1.0f, s))), &temp0, &temp1);
  3129 }
  3130 
  3131 
  3132