diff -r 000000000000 -r bde4ae8d615e os/graphics/m3g/m3gcore11/src/m3g_math.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/os/graphics/m3g/m3gcore11/src/m3g_math.c Fri Jun 15 03:10:57 2012 +0200 @@ -0,0 +1,3132 @@ +/* +* Copyright (c) 2003 Nokia Corporation and/or its subsidiary(-ies). +* All rights reserved. +* This component and the accompanying materials are made available +* under the terms of the License "Eclipse Public License v1.0" +* which accompanies this distribution, and is available +* at the URL "http://www.eclipse.org/legal/epl-v10.html". +* +* Initial Contributors: +* Nokia Corporation - initial contribution. +* +* Contributors: +* +* Description: Internal math function implementations +* +*/ + + +/*! + * \internal + * \file + * \brief Internal math function implementations + * + */ + +#ifndef M3G_CORE_INCLUDE +# error included by m3g_core.c; do not compile separately. +#endif + +#include "m3g_defs.h" +#include "m3g_memory.h" + +#if defined(M3G_SOFT_FLOAT) +#include +#endif + +/*---------------------------------------------------------------------- + * Private types and definitions + *--------------------------------------------------------------------*/ + +/* Enumerated common matrix classifications */ +#define MC_IDENTITY 0x40100401 // 01000000 00010000 00000100 00000001 +#define MC_FRUSTUM 0x30BF0C03 // 00110000 10111111 00001100 00000011 +#define MC_PERSPECTIVE 0x30B00C03 // 00110000 10110000 00001100 00000011 +#define MC_ORTHO 0x7F300C03 // 01111111 00110000 00001100 00000011 +#define MC_PARALLEL 0x70300C03 // 01110000 00110000 00001100 00000011 +#define MC_SCALING_ROTATION 0x403F3F3F // 01000000 00111111 00111111 00111111 +#define MC_SCALING 0x40300C03 // 01000000 00110000 00001100 00000011 +#define MC_TRANSLATION 0x7F100401 // 01111111 00010000 00000100 00000001 +#define MC_X_ROTATION 0x403C3C01 // 01000000 00111100 00111100 00000001 +#define MC_Y_ROTATION 0x40330433 // 01000000 00110011 00000100 00110011 +#define MC_Z_ROTATION 0x40100F0F // 01000000 00010000 00001111 00001111 +#define MC_W_UNITY 0x7F3F3F3F // 01111111 00111111 00111111 00111111 +#define MC_GENERIC 0xFFFFFFFF + +/* Partial masks for individual matrix components */ +#define MC_TRANSLATION_PART 0x3F000000 // 00111111 00000000 00000000 00000000 +#define MC_SCALE_PART 0x00300C03 // 00000000 00110000 00001100 00000011 +#define MC_SCALE_ROTATION_PART 0x003F3F3F // 00000000 00111111 00111111 00111111 + +/* Matrix element classification masks */ +#define ELEM_ZERO 0x00 +#define ELEM_ONE 0x01 +#define ELEM_MINUS_ONE 0x02 +#define ELEM_ANY 0x03 + +/*! + * \internal + * \brief calculates the offset of a 4x4 matrix element in a linear + * array + * + * \notes The current convention is column-major, as in OpenGL ES + */ +#define MELEM(row, col) ((row) + ((col) << 2)) + +/*! + * \internal + * \brief Macro for accessing 4x4 float matrix elements + * + * \param mtx pointer to the first element of the matrix + * \param row matrix row + * \param col matrix column + */ +#define M44F(mtx, row, col) ((mtx)->elem[MELEM((row), (col))]) + +/*--------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------- + * Private functions + *--------------------------------------------------------------------*/ + + +/*! + * \internal + * \brief ARM VFPv2 implementation of 4x4 matrix multiplication. + * + * \param dst multiplication result + * \param left left-hand matrix + * \param right right-hand matrix + */ +#if defined(M3G_HW_FLOAT_VFPV2) + +__asm void _m3gGenericMatrixProduct(Matrix *dst, + const Matrix *left, const Matrix *right) +{ +// r0 = *dst +// r1 = *left +// r2 = *right + + CODE32 + +// save the VFP registers and set the vector STRIDE = 1, LENGTH = 4 + + FSTMFDD sp!, {d8-d15} + + FMRX r3, FPSCR + BIC r12, r3, #0x00370000 + ORR r12, #0x00030000 + FMXR FPSCR, r12 + +// left = [a0 a1 a2 a3 right = [b0 b1 b2 b3 +// a4 a5 a6 a7 b4 b5 b6 b7 +// a8 a9 a10 a11 b8 b9 b10 b11 +// a12 a13 a14 a15] b12 b13 b14 b15] +// +// dst = [a0*b0+a4*b1+a8*b2+a12*b3 a1*b0+a5*b1+a9*b2+a13*b3 .. +// a0*b4+a4*b5+a8*b6+a12*b7 a1*b4+a5*b5+a9*b6+a13*b7 .. +// . . +// . . + + FLDMIAS r1!, {s8-s23} // load the left matrix [a0-a15] to registers s8-s23 + FLDMIAS r2!, {s0-s7} // load [b0-b7] of right matrix to registers s0-s7 + FMULS s24, s8, s0 // [s24-s27] = [a0*b0 a1*b0 a2*b0 a3*b0] + FMULS s28, s8, s4 // [s28-s31] = [a0*b4 a1*b4 a2*b4 a3*b4] + FMACS s24, s12, s1 // [s24-s27] += [a4*b1 a5*b1 a6*b1 a7*b1] + FMACS s28, s12, s5 // [s28-s31] += [a4*b5 a5*b5 a6*b5 a7*b5] + FMACS s24, s16, s2 // [s24-s27] += [a8*b2 a9*b2 a10*b2 a11*b2] + FMACS s28, s16, s6 // [s28-s31] += [a8*b6 a9*b6 a10*b6 a11*b6] + FMACS s24, s20, s3 // [s24-s27] += [a12*b3 a13*b3 a14*b3 a15*b3] + FMACS s28, s20, s7 // [s28-s31] += [a12*b7 a13*b37a14*b7 a15*b7] + FLDMIAS r2!, {s0-s7} // load [b8-b15] + FSTMIAS r0!, {s24-s31} // write [dst0-dst7] + FMULS s24, s8, s0 + FMULS s28, s8, s4 + FMACS s24, s12, s1 + FMACS s28, s12, s5 + FMACS s24, s16, s2 + FMACS s28, s16, s6 + FMACS s24, s20, s3 + FMACS s28, s20, s7 + FSTMIAS r0!, {s24-s31} + +// Restore the VFP registers and return. + + FMXR FPSCR, r3 + + FLDMFDD sp!, {d8-d15} + + BX lr + +} +#endif /* #if defined(M3G_HW_FLOAT_VFPV2) */ + + +/*------------------ Elementary float ------------------*/ + +#if defined(M3G_SOFT_FLOAT) + +#if defined (M3G_BUILD_ARM) + +/*! + * \internal + * \brief Floating point multiplication implementation for ARM. + */ +__asm M3Gfloat m3gMul(const M3Gfloat a, + const M3Gfloat b) +{ + /** + * Extract the exponents of the multiplicands and add them + * together. Flush to zero if either exponent or their sum + * is zero. + */ + + mov r12, #0xff; + ands r2, r0, r12, lsl #23; // exit if e1 == 0 + andnes r3, r1, r12, lsl #23; // exit if e2 == 0 + subne r2, r2, #(127 << 23); + addnes r12, r2, r3; // exit if e1+e2-127 <= 0 + movle r0, #0; + bxle lr; + + /** + * Determine the sign of the result. Note that the exponent + * may have overflowed to the sign bit, and thus the result + * may be an arbitrary negative value when it really should + * be +Inf or -Inf. + */ + + teq r0, r1; + orrmi r12, r12, #0x80000000; + + /** + * Multiply the mantissas. First shift the mantissas up to + * unsigned 1.31 fixed point, adding the leading "1" bit at + * the same time, and finally do a 32x32 -> 64 bit unsigned + * multiplication. The result is in unsigned 2.62 fixed point, + * representing the interval [1.0, 4.0). + */ + + mov r2, #0x80000000; + orr r0, r2, r0, lsl #8; + orr r1, r2, r1, lsl #8; + umulls r2, r3, r0, r1; + + /** + * If the highest bit of the 64-bit result is set, then the + * mantissa lies in [2.0, 4.0) and needs to be renormalized. + * That is, the mantissa is shifted one bit to the right and + * the exponent correspondingly increased by 1. Note that we + * lose the leading "1" bit from the mantissa by adding it up + * with the exponent. + */ + + subpl r12, r12, #(1 << 23); // no overflow: exponent -= 1 + addpl r0, r12, r3, lsr #7; // no overflow: exponent += 1 + addmi r0, r12, r3, lsr #8; // overflow: exponent += 1 + bx lr; +} + +/*! + * \internal + * \brief Floating point addition implementation for ARM. + */ +__asm M3Gfloat m3gAdd(const M3Gfloat a, + const M3Gfloat b) +{ + /** + * If the operands have opposite signs then this is not really + * an addition but a subtraction. Subtraction is much slower, + * so we have a separate code path for it, rather than trying + * to save space by handling both in the same place. + */ + + teq r0, r1; + bmi _m3gSub; + + /** + * Sort the operands such that the larger operand is in R0 and + * the smaller in R1. The sign bits do not affect the ordering, + * since they are known to be equal. + */ + + subs r2, r0, r1; + submi r0, r0, r2; + addmi r1, r1, r2; + + /** + * Extract the exponent of the smaller operand into R2 and compute + * the difference between the larger and smaller exponent into R3. + * (Note that the sign bits cancel out in the subtraction.) The + * exponent delta tells how many bits the mantissa of the smaller + * operand must be shifted to the right in order to bring the + * operands into equal scale. + */ + + mov r2, r1, lsr #23; + rsb r3, r2, r0, lsr #23; + + /** + * Check if the exponent delta is bigger than 23 bits, or if the + * smaller exponent is zero. In either case, exit the routine and + * return the larger operand (which is already in R0). Note that + * this means that subnormals are treated as zero. + */ + + rsbs r12, r3, #23; // N set, V clear if R3 > 23 + tstpl r2, #0xff; // execute only if R3 <= 23 + bxle lr; // exit if Z set or N != V + + /** + * Extract the mantissas and shift them up to unsigned 1.31 fixed + * point, inserting the implied leading "1" bit at the same time. + * Finally, align the decimal points and add up the mantissas. + */ + + mov r12, #0x80000000; + orr r0, r12, r0, lsl #8; + orr r1, r12, r1, lsl #8; + adds r0, r0, r1, lsr r3; + + /** + * Compute the final exponent by adding up the smaller exponent + * (R2), the exponent delta (R3), and the possible overflow bit + * (carry flag). Note that in case of overflow, the leading "1" + * has ended up in the carry flag and thus needs not be explicitly + * discarded. Finally, put the mantissa together with the sign and + * exponent. + */ + + adc r2, r2, r3; // r2 = smallExp + deltaExp + overflow + movcc r0, r0, lsl #1; // no overflow: discard leading 1 + mov r0, r0, lsr #9; + orr r0, r0, r2, lsl #23; + bx lr; + +_m3gSub + + /** + * Sort the operands such that the one with larger magnitude is + * in R0 and has the correct sign (the sign of the final result), + * while the smaller operand is in R1 with an inverted sign bit. + */ + + eor r1, r1, #0x80000000; + subs r2, r0, r1; + eormi r2, r2, #0x80000000; + submi r0, r0, r2; + addmi r1, r1, r2; + + /** + * Extract the exponent of the smaller operand into R2 and compute + * the difference between the larger and smaller exponent into R3. + * (Note that the sign bits cancel out in the subtraction.) The + * exponent delta tells how many bits the mantissa of the smaller + * operand must be shifted to the right in order to bring the + * operands into equal scale. + */ + + mov r2, r1, lsr #23; + rsbs r3, r2, r0, lsr #23; + + /** + * Check if the exponent delta is bigger than 31 bits, or if the + * smaller exponent is zero. In either case, exit the routine and + * return the larger operand (which is already in R0). Note that + * this means that subnormals are treated as zero. + */ + + rsbs r12, r3, #31; // N set, V clear if R3 > 31 + tstpl r2, #0xff; // execute only if R3 <= 31 + bxle lr; // exit if Z set or N != V + + /** + * Extract the mantissas and shift them up to unsigned 1.31 fixed + * point, inserting the implied leading "1" bit at the same time. + * Then align the decimal points and subtract the smaller mantissa + * from the larger one. + */ + + mov r12, #0x80000000; + orr r0, r12, r0, lsl #8; + orr r1, r12, r1, lsl #8; + subs r0, r0, r1, lsr r3; + + /** + * We split the range of possible results into three categories: + * + * 1. [1.0, 2.0) ==> no renormalization needed (highest bit set) + * 2. [0.5, 1.0) ==> only one left-shift needed + * 3. (0.0, 0.5) ==> multiple left-shifts needed + * 4. zero ==> just return + * + * Cases 1 and 2 are handled in the main code path. Cases 3 and 4 + * are less common by far, so we branch to a separate code fragment + * for those. + */ + + movpls r0, r0, lsl #1; // Cases 2,3,4: shift left + bpl _m3gSubRenormalize; // Cases 3 & 4: branch out + + /** + * Now we have normalized the mantissa such that the highest bit + * is set. Here we only need to adjust the exponent, if necessary, + * and put the pieces together. Note that we lose the leading "1" + * bit from the mantissa by adding it up with the exponent. We can + * also do proper rounding (towards nearest) instead of truncation + * (towards zero) at no extra cost! + */ + + sbc r3, r3, #1; // deltaExp -= 1 (Case 1) or 2 (Case 2) + add r2, r2, r3; // resultExp = smallExp + deltaExp + movs r0, r0, lsr #8; // shift mantissa, keep leading "1" + adc r0, r0, r2, lsl #23; // resultExp += 1, mantissa += carry + bx lr; + + /** + * Separate code path for cases 3 and 4 (see above). The mantissa + * has already been shifted up by one, but the exponent has not + * been correspondingly decreased. We also know that the highest + * bit is still not set, and that the carry flag is clear. + */ + +_m3gSubRenormalize + bxeq lr; + subcc r3, r3, #2; + movccs r0, r0, lsl #2; + + /** + * If the carry flag is still not set, i.e. there were more than + * two leading zeros in the mantissa initially, loop until we + * find the highest set bit. + */ + +_m3gSubRenormalizeLoop + subcc r3, r3, #1; + movccs r0, r0, lsl #1; + bcc _m3gSubRenormalizeLoop; + + /** + * Now the leading "1" is in the carry flag, so we can just add up + * the exponent and mantissa as usual, doing proper rounding at + * the same time. However, cases where the exponent goes negative + * (that is, underflows) must be detected and flushed to zero. + */ + + add r3, r2, r3; + movs r0, r0, lsr #9; + adc r0, r0, r3, lsl #23; + teq r0, r2, lsl #23; + movmi r0, #0; + bx lr; +} + +#else /* M3G_BUILD_ARM */ + +/*! + * \internal + * \brief Floating point addition implementation + * + */ +static M3G_INLINE M3Gfloat m3gFloatAdd(const M3Guint aBits, + const M3Guint bBits) +{ + M3Guint large, small, signMask; + + /* Early exits for underflow cases */ + + large = (M3Gint)(aBits & ~SIGN_MASK); + if (large <= 0x00800000) { + return INT_AS_FLOAT(bBits); + } + small = (M3Gint)(bBits & ~SIGN_MASK); + if (small <= 0x00800000) { + return INT_AS_FLOAT(aBits); + } + + /* Swap the numbers so that "large" really is larger; the unsigned + * (or de-signed) bitmasks for floats are nicely monotonous, so we + * can compare directly. Also store the sign of the larger number + * for future reference. */ + + if (small > large) { + M3Gint temp = small; + small = large; + large = temp; + signMask = (bBits & SIGN_MASK); + } + else { + signMask = (aBits & SIGN_MASK); + } + + { + M3Guint res, overflow; + M3Guint resExp, expDelta; + + /* Store the larger exponent as our candidate result exponent, + * and compute the difference between the exponents */ + + resExp = (large >> 23); + expDelta = resExp - (small >> 23); + + /* Take an early exit if the change would be insignificant; + * this also guards against odd results from shifting by more + * than 31 (undefined in C) */ + + if (expDelta >= 24) { + res = large | signMask; + return INT_AS_FLOAT(res); + } + + /* Convert the mantissas into shifted integers, and shift the + * smaller number to the same scale with the larger one. */ + + large = (large & MANTISSA_MASK) | LEADING_ONE; + small = (small & MANTISSA_MASK) | LEADING_ONE; + small >>= expDelta; + M3G_ASSERT(large >= small); + + /* Check whether we're really adding or subtracting the + * smaller number, and branch to slightly different routines + * respectively */ + + if (((aBits ^ bBits) & SIGN_MASK) == 0) { + + /* Matching signs; just add the numbers and check for + * overflow, shifting to compensate as necessary. */ + + res = large + small; + + overflow = (res >> 24); + res >>= overflow; + resExp += overflow; + } + else { + + /* Different signs, so let's subtract the smaller value; + * also check that we're not subtracting a number from + * itself (so we don't have to normalize a zero below) */ + + if (small == large) { + return 0.0f; /* x - x = 0 */ + } + + res = (large << 8) - (small << 8); + + /* Renormalize the number by shifting until we've got the + * high bit in place */ + + while ((res >> 24) == 0) { + res <<= 8; + resExp -= 8; + } + while ((res >> 31) == 0) { + res <<= 1; + --resExp; + } + res >>= 8; + } + + /* Flush to zero in case of over/underflow of the exponent */ + + if (resExp >= 255) { + return 0.0f; + } + + /* Compose the final number into "res"; note that we pull in + * the sign of the original larger number, which should still + * be valid */ + + res &= MANTISSA_MASK; + res |= (resExp << 23); + res |= signMask; + + return INT_AS_FLOAT(res); + } +} + +/*! + * \internal + * \brief Floating point multiplication implementation + */ +static M3G_INLINE M3Gfloat m3gFloatMul(const M3Guint aBits, + const M3Guint bBits) +{ + M3Guint a, b; + + /* Early exits for underflow and multiplication by zero */ + + a = (aBits & ~SIGN_MASK); + if (a <= 0x00800000) { + return 0.0f; + } + + b = (bBits & ~SIGN_MASK); + if (b <= 0x00800000) { + return 0.0f; + } + + { + M3Guint res, exponent, overflow; + + /* Compute the exponent of the result, assuming the mantissas + * don't overflow; then mask out the original exponents */ + + exponent = (a >> 23) + (b >> 23) - 127; + a &= MANTISSA_MASK; + b &= MANTISSA_MASK; + + /* Compute the new mantissa from: + * + * (1 + a)(1 + b) = ab + a + b + 1 + * + * First shift the mantissas from 0.23 down to 0.16 for the + * multiplication, then shift back to 0.23 for adding in the + * "a + b + 1" part of the equation. */ + + res = (a >> 7) * (b >> 7); /* "ab" at 0.32 */ + res = (res >> 9) + a + b + LEADING_ONE; + + /* Add the leading one, then normalize the result by checking + * the overflow bit and dividing by two if necessary */ + + overflow = (res >> 24); + res >>= overflow; + exponent += overflow; + + /* Flush to zero in case of over/underflow of the exponent */ + + if (exponent >= 255) { + return 0.0f; + } + + /* Compose the final number into "res" */ + + res &= MANTISSA_MASK; + res |= (exponent << 23); + res |= (aBits ^ bBits) & SIGN_MASK; + + return INT_AS_FLOAT(res); + } +} + +#endif /* M3G_BUILD_ARM */ + +/*! + * \internal + * \brief Computes the signed fractional part of a floating point + * number + * + * \param x floating point value + * \return x signed fraction of x in ]-1, 1[ + */ +static M3Gfloat m3gFrac(M3Gfloat x) +{ + /* Quick exit for -1 < x < 1 */ + + if (m3gAbs(x) < 1.0f) { + return x; + } + + /* Shift the mantissa to the proper place, mask out the integer + * part, and renormalize */ + { + M3Guint ix = FLOAT_AS_UINT(x); + M3Gint expo = ((ix >> 23) & 0xFF) - 127; + M3G_ASSERT(expo >= 0); + + /* The decimal part will always be zero for large values with + * exponents over 24 */ + + if (expo >= 24) { + return 0.f; + } + else { + + /* Shift the integer part out of the mantissa and see what + * we have left */ + + M3Guint base = (ix & MANTISSA_MASK) | LEADING_ONE; + base = (base << expo) & MANTISSA_MASK; + + /* Quick exit (and guard against infinite looping) for + * zero */ + + if (base == 0) { + return 0.f; + } + + /* We now have an exponent of 0 (i.e. no shifting), but + * must renormalize to get a set bit in place of the + * hidden (implicit one) bit */ + + expo = 0; + + while ((base >> 19) == 0) { + base <<= 4; + expo -= 4; + } + while ((base >> 23) == 0) { + base <<= 1; + --expo; + } + + /* Compose the final number */ + + ix = + (base & MANTISSA_MASK) | + ((expo + 127) << 23) | + (ix & SIGN_MASK); + return INT_AS_FLOAT(ix); + } + } +} + +#endif /* M3G_SOFT_FLOAT */ + +#if defined(M3G_DEBUG) +/*! + * \internal + * \brief Checks for NaN or infinity in a floating point input + */ +static void m3gValidateFloats(int n, float *p) +{ + while (n-- > 0) { + M3G_ASSERT(EXPONENT(*p) < 120); + ++p; + } +} +#else +# define m3gValidateFloats(n, p) +#endif + +/*------------------ Trigonometry and exp ----------*/ + + +#if defined(M3G_SOFT_FLOAT) +/*! + * \internal + * \brief Sine for the first quadrant + * + * \param x floating point value in [0, PI/2] + * \return sine of \c x + */ +static M3Gfloat m3gSinFirstQuadrant(M3Gfloat x) +{ + M3Guint bits = FLOAT_AS_UINT(x); + + if (bits <= 0x3ba3d70au) /* 0.005f */ + return x; + else { + static const M3Gfloat sinTermLut[4] = { + -1.0f / (2*3), + -1.0f / (4*5), + -1.0f / (6*7), + -1.0f / (8*9) + }; + + M3Gfloat xx = m3gSquare(x); + M3Gfloat sinx = x; + int i; + + for (i = 0; i < 4; ++i) { + x = m3gMul(x, m3gMul(xx, sinTermLut[i])); + sinx = m3gAdd(sinx, x); + } + + return sinx; + } +} +#endif /* M3G_SOFT_FLOAT */ + +#if defined(M3G_SOFT_FLOAT) +/*! + * \internal + * \brief Computes sine for the first period + * + * \param x floating point value in [0, 2PI] + * \return sine of x + */ +static M3Gfloat m3gSinFirstPeriod(const M3Gfloat x) +{ + M3G_ASSERT(x >= 0 && x <= TWO_PI); + + if (x >= PI) { + return m3gNegate(m3gSinFirstQuadrant(x >= ONE_AND_HALF_PI ? + m3gSub(TWO_PI, x) : + m3gSub(x, PI))); + } + return m3gSinFirstQuadrant((x >= HALF_PI) ? m3gSub(PI, x) : x); +} +#endif /* M3G_SOFT_FLOAT */ + +/*------------- Float vs. int conversion helpers -------------*/ + +/*! + * \internal + * \brief Scales and clamps a float to unsigned byte range + */ +static M3G_INLINE M3Gint m3gFloatToByte(const M3Gfloat a) +{ + return m3gRoundToInt(m3gMul(255.f, m3gClampFloat(a, 0.f, 1.f))); +} + +/*------------------ Vector helpers ------------------*/ + +/*! + * \internal + * \brief Computes the norm of a floating-point 3-vector + */ +static M3G_INLINE M3Gfloat m3gNorm3(const M3Gfloat *v) +{ + return m3gAdd(m3gAdd(m3gSquare(v[0]), m3gSquare(v[1])), + m3gSquare(v[2])); +} + +/*! + * \internal + * \brief Computes the norm of a floating-point 4-vector + */ +static M3G_INLINE M3Gfloat m3gNorm4(const M3Gfloat *v) +{ + return m3gAdd(m3gAdd(m3gSquare(v[0]), m3gSquare(v[1])), + m3gAdd(m3gSquare(v[2]), m3gSquare(v[3]))); +} + +/*! + * \internal + * \brief Scales a floating-point 3-vector + */ +static M3G_INLINE void m3gScale3(M3Gfloat *v, M3Gfloat s) +{ + v[0] = m3gMul(v[0], s); + v[1] = m3gMul(v[1], s); + v[2] = m3gMul(v[2], s); +} + +/*! + * \internal + * \brief Scales a floating-point 4-vector + */ +static M3G_INLINE void m3gScale4(M3Gfloat *v, M3Gfloat s) +{ + v[0] = m3gMul(v[0], s); + v[1] = m3gMul(v[1], s); + v[2] = m3gMul(v[2], s); + v[3] = m3gMul(v[3], s); +} + + +/*------------------ Matrices ------------------*/ + +/*! + * \internal + */ +static M3G_INLINE M3Gbool m3gIsClassified(const Matrix *mtx) +{ + M3G_ASSERT(mtx != NULL); + return (M3Gbool) mtx->classified; +} + +/*! + * \internal + * \brief Returns a classification for a single floating point number + */ +static M3G_INLINE M3Guint m3gElementClass(const M3Gfloat x) +{ + if (IS_ZERO(x)) { + return ELEM_ZERO; + } + else if (IS_ONE(x)) { + return ELEM_ONE; + } + else if (IS_MINUS_ONE(x)) { + return ELEM_MINUS_ONE; + } + return ELEM_ANY; +} + +/*! + * \internal + * \brief Computes the classification mask of a matrix + * + * The mask is constructed from two bits per elements, with the lowest + * two bits corresponding to the first element in the \c elem array of + * the matrix. + */ +static void m3gClassify(Matrix *mtx) +{ + M3Guint mask = 0; + const M3Gfloat *p; + int i; + + M3G_ASSERT(mtx != NULL); + M3G_ASSERT(!m3gIsClassified(mtx)); + + p = mtx->elem; + for (i = 0; i < 16; ++i) { + M3Gfloat elem = *p++; + mask |= (m3gElementClass(elem) << (i*2)); + } + mtx->mask = mask; + mtx->classified = M3G_TRUE; +} + +/*! + * \internal + * \brief Sets matrix classification directly + */ +static M3G_INLINE void m3gClassifyAs(M3Guint mask, Matrix *mtx) +{ + M3G_ASSERT(mtx != NULL); + mtx->mask = mask; + mtx->classified = M3G_TRUE; + mtx->complete = M3G_FALSE; +} + +/*! + * \internal + * \brief Attempts to classify a matrix more precisely + * + * Tries to classify all "free" elements of a matrix into one of the + * predefined constants to improve precision and performance in + * subsequent calculations. + */ +static void m3gSubClassify(Matrix *mtx) +{ + M3G_ASSERT_PTR(mtx); + M3G_ASSERT(m3gIsClassified(mtx)); + { + const M3Gfloat *p = mtx->elem; + M3Guint inMask = mtx->mask; + M3Guint outMask = inMask; + int i; + + for (i = 0; i < 16; ++i, inMask >>= 2) { + M3Gfloat elem = *p++; + if ((inMask & 0x03) == ELEM_ANY) { + outMask &= ~(0x03u << (i*2)); + outMask |= (m3gElementClass(elem) << (i*2)); + } + } + mtx->mask = outMask; + } +} + +/*! + * \internal + * \brief Fills in the implicit values for a classified matrix + */ +static void m3gFillClassifiedMatrix(Matrix *mtx) +{ + int i; + M3Guint mask; + M3Gfloat *p; + + M3G_ASSERT(mtx != NULL); + M3G_ASSERT(mtx->classified); + M3G_ASSERT(!mtx->complete); + + mask = mtx->mask; + p = mtx->elem; + + for (i = 0; i < 16; ++i, mask >>= 2) { + unsigned elem = (mask & 0x03); + switch (elem) { + case ELEM_ZERO: *p++ = 0.0f; break; + case ELEM_ONE: *p++ = 1.0f; break; + case ELEM_MINUS_ONE: *p++ = -1.0f; break; + default: ++p; + } + } + mtx->complete = M3G_TRUE; +} + + +#if !defined(M3G_HW_FLOAT) +/*! + * \internal + * \brief Performs one multiply-add of classified matrix elements + * + * \param amask element class of the first multiplicand + * \param a float value of the first multiplicand + * \param bmask element class of the second multiplicand + * \param b float value of the second multiplicand + * \param c float value to add + * \return a * b + c + * + * \notes inline, as only called from the matrix product function + */ +static M3G_INLINE M3Gfloat m3gClassifiedMadd( + const M3Gbitmask amask, const M3Gfloat *pa, + const M3Gbitmask bmask, const M3Gfloat *pb, + const M3Gfloat c) +{ + /* Check for zero product to reduce the switch cases below */ + + if (amask == ELEM_ZERO || bmask == ELEM_ZERO) { + return c; + } + + /* Branch based on the classification of a */ + + switch (amask) { + + case ELEM_ANY: + if (bmask == ELEM_ONE) { + return m3gAdd(*pa, c); /* a * 1 + c = a + c */ + } + if (bmask == ELEM_MINUS_ONE) { + return m3gSub(c, *pa); /* a * -1 + c = -a + c */ + } + return m3gMadd(*pa, *pb, c); /* a * b + c */ + + case ELEM_ONE: + if (bmask == ELEM_ONE) { + return m3gAdd(c, 1.f); /* 1 * 1 + c = 1 + c */ + } + if (bmask == ELEM_MINUS_ONE) { + return m3gSub(c, 1.f); /* 1 * -1 + c = -1 + c */ + } + return m3gAdd(*pb, c); /* 1 * b + c = b + c */ + + case ELEM_MINUS_ONE: + if (bmask == ELEM_ONE) { + return m3gSub(c, 1.f); /* -1 * 1 + c = -1 + c */ + } + if (bmask == ELEM_MINUS_ONE) { + return m3gAdd(c, 1.f); /* -1 * -1 + c = 1 + c */ + } + return m3gSub(c, *pb); /* -1 * b + c = -b + c */ + + default: + M3G_ASSERT(M3G_FALSE); + return 0.0f; + } +} +#endif /*!defined(M3G_HW_FLOAT)*/ + +/*! + * \internal + * \brief Computes a generic 4x4 matrix product + */ +static void m3gGenericMatrixProduct(Matrix *dst, + const Matrix *left, const Matrix *right) +{ + M3G_ASSERT(dst != NULL && left != NULL && right != NULL); + + { +# if defined(M3G_HW_FLOAT) + if (!left->complete) { + m3gFillClassifiedMatrix((Matrix*)left); + } + if (!right->complete) { + m3gFillClassifiedMatrix((Matrix*)right); + } +# else + const unsigned lmask = left->mask; + const unsigned rmask = right->mask; +# endif + +#if defined(M3G_HW_FLOAT_VFPV2) + _m3gGenericMatrixProduct(dst, left, right); +#else + { + int row; + + for (row = 0; row < 4; ++row) { + int col; + for (col = 0; col < 4; ++col) { + int k; + M3Gfloat a = 0; + for (k = 0; k < 4; ++k) { + M3Gint lidx = MELEM(row, k); + M3Gint ridx = MELEM(k, col); +# if defined(M3G_HW_FLOAT) + a = m3gMadd(left->elem[lidx], right->elem[ridx], a); +# else + a = m3gClassifiedMadd((lmask >> (2 * lidx)) & 3, + &left->elem[lidx], + (rmask >> (2 * ridx)) & 3, + &right->elem[ridx], + a); +# endif /*!M3G_HW_FLOAT*/ + } + M44F(dst, row, col) = a; + } + } + } +#endif /*!M3G_HW_FLOAT_VFPV2*/ + } + dst->complete = M3G_TRUE; + dst->classified = M3G_FALSE; +} + +/*! + * \internal + * \brief Converts a float vector to 16-bit integers + * + * \param size vector length + * \param vec pointer to float vector + * \param scale scale of output values, as the number of bits to + * shift left to get actual values + * \param outVec output value vector + */ +static void m3gFloatVecToShort(M3Gint size, const M3Gfloat *vec, + M3Gint scale, M3Gshort *outVec) +{ + const M3Guint *vecInt = (const M3Guint*) vec; + M3Gint i; + + for (i = 0; i < size; ++i) { + M3Guint a = vecInt[i]; + if ((a & ~SIGN_MASK) < (1 << 23)) { + *outVec++ = 0; + } + else { + M3Gint shift = + scale - ((M3Gint)((vecInt[i] >> 23) & 0xFFu) - (127 + 23)); + M3G_ASSERT(shift > 8); /* or the high bits will overflow */ + + if (shift > 23) { + *outVec++ = 0; + } + else { + M3Gint out = + (M3Gint) (((a & MANTISSA_MASK) | LEADING_ONE) >> shift); + if (a >> 31) { + out = -out; + } + M3G_ASSERT(m3gInRange(out, -32767, 32767)); + *outVec++ = (M3Gshort) out; + } + } + } +} + +/*---------------------------------------------------------------------- + * Internal functions + *--------------------------------------------------------------------*/ + +/*--------------------------------------------------------------------*/ + +#if defined(M3G_SOFT_FLOAT) + +#if !defined (M3G_BUILD_ARM) + +static M3Gfloat m3gAdd(const M3Gfloat a, const M3Gfloat b) +{ + return m3gFloatAdd(FLOAT_AS_UINT(a), FLOAT_AS_UINT(b)); +} + +static M3Gfloat m3gMul(const M3Gfloat a, const M3Gfloat b) +{ + return m3gFloatMul(FLOAT_AS_UINT(a), FLOAT_AS_UINT(b)); +} + +#endif /* M3G_BUILD_ARM */ + +/*! + * \internal + * \brief Computes the reciprocal of the square root + * + * \param x a floating point value + * \return 1 / square root of \c x + */ +static M3Gfloat m3gRcpSqrt(const M3Gfloat x) +{ + /* Approximation followed by Newton-Raphson iteration a'la + * "Floating-point tricks" by J. Blinn, but we iterate several + * times to improve precision */ + + M3Gint i = (M3G_FLOAT_ONE + (M3G_FLOAT_ONE >> 1)) + - (FLOAT_AS_UINT(x) >> 1); + M3Gfloat y = INT_AS_FLOAT(i); + for (i = 0; i < 3; ++i) { + y = m3gMul(y, m3gSub(1.5f, m3gHalf(m3gMul(x, m3gSquare(y))))); + } + return y; +} + +/*! + * \internal + * \brief Computes the square root + * + * \param x a floating point value + * \return square root of \c x + */ +static M3Gfloat m3gSqrt(const M3Gfloat x) +{ + /* Approximation followed by Newton-Raphson iteration a'la + * "Floating-point tricks" by J. Blinn, but we iterate several + * times to improve precision */ + + M3Gint i = (FLOAT_AS_UINT(x) >> 1) + (M3G_FLOAT_ONE >> 1); + M3Gfloat y = INT_AS_FLOAT(i); + for (i = 0; i < 2; ++i) { + y = m3gDiv(m3gAdd(m3gSquare(y), x), m3gDouble(y)); + } + return y; +} + +#endif /* M3G_SOFT_FLOAT */ + +/*--------------------------------------------------------------------*/ + +#if defined(M3G_SOFT_FLOAT) +/*! + * \internal + */ +static M3Gfloat m3gArcCos(const M3Gfloat x) +{ + return (M3Gfloat) acos(x); +} + +/*! + * \internal + */ +static M3Gfloat m3gArcTan(const M3Gfloat y, const M3Gfloat x) +{ + return (M3Gfloat) atan2(y, x); +} + +/*! + * \internal + */ +static M3Gfloat m3gCos(const M3Gfloat x) +{ + return m3gSin(m3gAdd(x, HALF_PI)); +} + +/*! + * \internal + * \brief + */ +static M3Gfloat m3gSin(const M3Gfloat x) +{ + M3Gfloat f = x; + + /* If x is greater than two pi, do a modulo operation to bring it + * back in range for the internal sine function */ + + if (m3gAbs(f) >= TWO_PI) { + f = m3gMul (f, (1.f / TWO_PI)); + f = m3gFrac(f); + f = m3gMul (f, TWO_PI); + } + + /* Compute the result, negating both the input value and the + * result if x was negative */ + { + M3Guint i = FLOAT_AS_UINT(f); + M3Guint neg = (i & SIGN_MASK); + i ^= neg; + f = m3gSinFirstPeriod(INT_AS_FLOAT(i)); + i = FLOAT_AS_UINT(f) ^ neg; + return INT_AS_FLOAT(i); + } +} + +/*! + * \internal + */ +static M3Gfloat m3gTan(const M3Gfloat x) +{ + return (M3Gfloat) tan(x); +} + +/*! + * \internal + */ +static M3Gfloat m3gExp(const M3Gfloat a) +{ + return (M3Gfloat) exp(a); +} +#endif /* M3G_SOFT_FLOAT */ + +/*! + * \brief Checks whether the bottom row of a matrix is 0 0 0 1 + */ +static M3Gbool m3gIsWUnity(const Matrix *mtx) +{ + M3G_ASSERT_PTR(mtx); + + if (!m3gIsClassified(mtx)) { + return (IS_ZERO(M44F(mtx, 3, 0)) && + IS_ZERO(M44F(mtx, 3, 1)) && + IS_ZERO(M44F(mtx, 3, 2)) && + IS_ONE (M44F(mtx, 3, 3))); + } + else { + return ((mtx->mask & 0xC0C0C0C0) == (ELEM_ONE << 30)); + } +} + +/*! + * \brief Makes a quaternion by exponentiating a quaternion logarithm + */ +static void m3gExpQuat(Quat *quat, const Vec3 *qExp) +{ + M3Gfloat theta; + + M3G_ASSERT_PTR(quat); + M3G_ASSERT_PTR(qExp); + + theta = m3gSqrt(m3gAdd(m3gAdd(m3gSquare(qExp->x), + m3gSquare(qExp->y)), + m3gSquare(qExp->z))); + + if (theta > EPSILON) { + M3Gfloat s = m3gMul(m3gSin(theta), m3gRcp(theta)); + quat->x = m3gMul(qExp->x, s); + quat->y = m3gMul(qExp->y, s); + quat->z = m3gMul(qExp->z, s); + quat->w = m3gCos(theta); + } + else { + quat->x = quat->y = quat->z = 0.0f; + quat->w = 1.0f; + } +} + +/*! + * \brief Natural logarithm of a unit quaternion. + */ +static void m3gLogQuat(Vec3 *qLog, const Quat *quat) +{ + M3Gfloat sinTheta = m3gSqrt(m3gNorm3((const float *) quat)); + + if (sinTheta > EPSILON) { + M3Gfloat s = m3gArcTan(sinTheta, quat->w) / sinTheta; + qLog->x = m3gMul(s, quat->x); + qLog->y = m3gMul(s, quat->y); + qLog->z = m3gMul(s, quat->z); + } + else { + qLog->x = qLog->y = qLog->z = 0.0f; + } +} + +/*! + * \brief Make quaternion the "logarithmic difference" between two + * other quaternions. + */ +static void m3gLogDiffQuat(Vec3 *logDiff, + const Quat *from, const Quat *to) +{ + Quat temp; + temp.x = m3gNegate(from->x); + temp.y = m3gNegate(from->y); + temp.z = m3gNegate(from->z); + temp.w = from->w; + m3gMulQuat(&temp, to); + m3gLogQuat(logDiff, &temp); +} + +/*! + * \brief Rounds a float to the nearest integer + * + * Overflows are clamped to the maximum or minimum representable + * value. + */ +static M3Gint m3gRoundToInt(const M3Gfloat a) +{ + M3Guint base = FLOAT_AS_UINT(a); + M3Gint signMask, expo; + + /* Decompose the number into sign, exponent, and base number */ + + signMask = ((M3Gint) base >> 31); /* -> 0 or 0xFFFFFFFF */ + expo = (M3Gint)((base >> 23) & 0xFF) - 127; + + /* First check for large values and return either the negative or + * the positive maximum integer in case of overflow. The overflow + * check can be made on the exponent alone, as large floats are + * spaced several integer values apart so that nothing will + * overflow because of rounding later on */ + + if (expo >= 31) { + return (M3Gint)((1U << 31) - 1 + (((M3Guint) signMask) & 1)); + } + + /* Also check for underflow to avoid problems with shifting by + * more than 31 */ + + if (expo < -1) { + return 0; + } + + /* Mask out the sign and exponent bits, shift the base number so + * that the lowest bit corresponds to one half, then add one + * (half) and shift to round to the closest integer. */ + + base = (base | LEADING_ONE) << 8; /* shift mantissa to 1.31 */ + base = base >> (30 - expo); /* integer value as 31.1 */ + base = (base + 1) >> 1; /* round to nearest 32.0 */ + + /* Factor in the sign (negate if originally negative) and + * return */ + + return ((M3Gint) base ^ signMask) - signMask; +} + +/*! + * \brief Calculates ray-triangle intersection. + * + * http://www.acm.org/jgt/papers/MollerTrumbore97 + */ +static M3Gbool m3gIntersectTriangle(const Vec3 *orig, const Vec3 *dir, + const Vec3 *vert0, const Vec3 *vert1, const Vec3 *vert2, + Vec3 *tuv, M3Gint cullMode) +{ + Vec3 edge1, edge2, tvec, pvec, qvec; + M3Gfloat det,inv_det; + + /* find vectors for two edges sharing vert0 */ + edge1 = *vert1; + edge2 = *vert2; + m3gSubVec3(&edge1, vert0); + m3gSubVec3(&edge2, vert0); + + /* begin calculating determinant - also used to calculate U parameter */ + m3gCross(&pvec, dir, &edge2); + + /* if determinant is near zero, ray lies in plane of triangle */ + det = m3gDot3(&edge1, &pvec); + + if (cullMode == 0 && det <= 0) return M3G_FALSE; + if (cullMode == 1 && det >= 0) return M3G_FALSE; + + if (det > -EPSILON && det < EPSILON) + return M3G_FALSE; + inv_det = m3gRcp(det); + + /* calculate distance from vert0 to ray origin */ + tvec = *orig; + m3gSubVec3(&tvec, vert0); + + /* calculate U parameter and test bounds */ + tuv->y = m3gMul(m3gDot3(&tvec, &pvec), inv_det); + if (tuv->y < 0.0f || tuv->y > 1.0f) + return M3G_FALSE; + + /* prepare to test V parameter */ + m3gCross(&qvec, &tvec, &edge1); + + /* calculate V parameter and test bounds */ + tuv->z = m3gMul(m3gDot3(dir, &qvec), inv_det); + if (tuv->z < 0.0f || m3gAdd(tuv->y, tuv->z) > 1.0f) + return M3G_FALSE; + + /* calculate t, ray intersects triangle */ + tuv->x = m3gMul(m3gDot3(&edge2, &qvec), inv_det); + + return M3G_TRUE; +} + +/*! + * \brief Calculates ray-box intersection. + * + * http://www.siggraph.org/education/materials/HyperGraph/raytrace/rtinter3.htm + */ + +#define XO orig->x +#define YO orig->y +#define ZO orig->z +#define XD dir->x +#define YD dir->y +#define ZD dir->z + +#define XL box->min[0] +#define YL box->min[1] +#define ZL box->min[2] +#define XH box->max[0] +#define YH box->max[1] +#define ZH box->max[2] + +/*! + * \internal + * \brief Ray - bounding box intersection + * + */ +static M3Gbool m3gIntersectBox(const Vec3 *orig, const Vec3 *dir, const AABB *box) +{ + M3Gfloat tnear = M3G_MIN_NEGATIVE_FLOAT; + M3Gfloat tfar = M3G_MAX_POSITIVE_FLOAT; + M3Gfloat t1, t2, temp; + + /* X slab */ + if(XD != 0) { + t1 = m3gSub(XL, XO) / XD; + t2 = m3gSub(XH, XO) / XD; + + if(t1 > t2) { + temp = t1; + t1 = t2; + t2 = temp; + } + + if(t1 > tnear) tnear = t1; + if(t2 < tfar) tfar = t2; + + if(tnear > tfar) return M3G_FALSE; + if(tfar < 0) return M3G_FALSE; + } + else { + if(XO > XH || XO < XL) return M3G_FALSE; + } + + /* Y slab */ + if(YD != 0) { + t1 = m3gSub(YL, YO) / YD; + t2 = m3gSub(YH, YO) / YD; + + if(t1 > t2) { + temp = t1; + t1 = t2; + t2 = temp; + } + + if(t1 > tnear) tnear = t1; + if(t2 < tfar) tfar = t2; + + if(tnear > tfar) return M3G_FALSE; + if(tfar < 0) return M3G_FALSE; + } + else { + if(YO > YH || YO < YL) return M3G_FALSE; + } + + /* Z slab */ + if(ZD != 0) { + t1 = m3gSub(ZL, ZO) / ZD; + t2 = m3gSub(ZH, ZO) / ZD; + + if(t1 > t2) { + temp = t1; + t1 = t2; + t2 = temp; + } + + if(t1 > tnear) tnear = t1; + if(t2 < tfar) tfar = t2; + + if(tnear > tfar) return M3G_FALSE; + if(tfar < 0) return M3G_FALSE; + } + else { + if(ZO > ZH || ZO < ZL) return M3G_FALSE; + } + + return M3G_TRUE; +} + +/*! + * \brief Calculates the intersection of two rectangles. Always fills + * the intersection result. + * + * \param dst result of the intersection + * \param r1 rectangle 1 + * \param r2 rectangle 2 + */ +static M3Gbool m3gIntersectRectangle(M3GRectangle *dst, M3GRectangle *r1, M3GRectangle *r2) +{ + M3Gbool intersects = M3G_TRUE; + M3Gint min, max; + + max = (r1->x) >= (r2->x) ? (r1->x) : (r2->x); + min = (r1->x + r1->width) <= (r2->x + r2->width) ? (r1->x + r1->width) : (r2->x + r2->width); + if ((min - max) < 0) intersects = M3G_FALSE; + dst->x = max; + dst->width = min - max; + + max = (r1->y) >= (r2->y) ? (r1->y) : (r2->y); + min = (r1->y + r1->height) <= (r2->y + r2->height) ? (r1->y + r1->height) : (r2->y + r2->height); + if ((min - max) < 0) intersects = M3G_FALSE; + dst->y = max; + dst->height = min - max; + + return intersects; +} + +/*-------- float-to-int color conversions --------*/ + +static M3Guint m3gAlpha1f(M3Gfloat a) +{ + M3Guint alpha = (M3Guint) m3gFloatToByte(a); + return (alpha << 24) | M3G_RGB_MASK; +} + +static M3Guint m3gColor3f(M3Gfloat r, M3Gfloat g, M3Gfloat b) +{ + return ((M3Guint) m3gFloatToByte(r) << 16) + | ((M3Guint) m3gFloatToByte(g) << 8) + | (M3Guint) m3gFloatToByte(b) + | M3G_ALPHA_MASK; +} + +static M3Guint m3gColor4f(M3Gfloat r, M3Gfloat g, M3Gfloat b, M3Gfloat a) +{ + return ((M3Guint) m3gFloatToByte(r) << 16) + | ((M3Guint) m3gFloatToByte(g) << 8) + | (M3Guint) m3gFloatToByte(b) + | ((M3Guint) m3gFloatToByte(a) << 24); +} + +static void m3gFloatColor(M3Gint argb, M3Gfloat intensity, M3Gfloat *rgba) +{ + /* NOTE we intentionally aim a bit high here -- some GL + * implementations may round down instead of closest */ + + const M3Gfloat oneOver255 = (M3Gfloat)(1.0001 / 255.0); + + rgba[0] = (M3Gfloat)((argb >> 16) & 0xFF); + rgba[1] = (M3Gfloat)((argb >> 8) & 0xFF); + rgba[2] = (M3Gfloat)((argb ) & 0xFF); + rgba[3] = (M3Gfloat)((argb >> 24) & 0xFF); + + m3gScale4(rgba, m3gMul(oneOver255, intensity)); +} + +/*! + * \brief Converts the 3x3 submatrix of a matrix into fixed point + * + * The input matrix must be an affine one, i.e. the bottom row must be + * 0 0 0 1. The output matrix is guaranteed to be such that it can be + * multiplied with a 16-bit 3-vector without overflowing, while using + * the 32-bit range optimally. + * + * \param mtx the original matrix + * \param elem 9 shorts to hold the fixed point base numbers + * \return floating point exponent for the values in \c elem + * (number of bits to shift left for actual values) + */ +static M3Gint m3gGetFixedPoint3x3Basis(const Matrix *mtx, M3Gshort *elem) +{ + M3Gint outExp; + M3Gint row, col; + const M3Guint *m; + + if (!mtx->complete) { + m3gFillClassifiedMatrix((Matrix*) mtx); + } + m = (const M3Guint*) mtx->elem; + + /* First, find the maximum exponent value in the whole matrix */ + + outExp = 0; + for (col = 0; col < 3; ++col) { + for (row = 0; row < 3; ++row) { + M3Gint element = (M3Gint)(m[MELEM(row, col)] & ~SIGN_MASK); + outExp = M3G_MAX(outExp, element); + } + } + outExp >>= 23; + + /* Our candidate exponent is the maximum found plus 9, which is + * guaranteed to shift the maximum unsigned 24-bit mantissa (which + * always will have the high bit set) down to the signed 16-bit + * range */ + + outExp += 9; + + /* Now proceed to sum each row and see what's the actual smallest + * exponent we can safely use without overflowing in a 16+16 + * matrix-vector multiplication; this will win us one bit + * (doubling the precision) compared to the conservative approach + * of just shifting everything down by 10 bits */ + + for (row = 0; row < 3; ++row) { + + /* Sum the absolute values on this row */ + + M3Gint rowSum = 0; + for (col = 0; col < 3; ++col) { + M3Gint a = (M3Gint)(m[MELEM(row, col)] & ~SIGN_MASK); + M3Gint shift = outExp - (a >> 23); + M3G_ASSERT(shift < 265); + + if (shift < 24) { + rowSum += ((a & MANTISSA_MASK) | LEADING_ONE) >> shift; + } + } + + /* Now we have a 26-bit sum of the absolute values on this + * row, and shift that down until we fit the target range of + * [0, 65535]. Note that this still leaves *exactly* enough + * space for adding in an arbitrary 16-bit translation vector + * after multiplying with the matrix! */ + + while (rowSum >= (1 << 16)) { + rowSum >>= 1; + ++outExp; + } + } + + /* De-bias the exponent, but add in an extra 23 to account for the + * decimal bits in the floating point mantissa values we started + * with (we're returning the exponent as "bits to shift left to + * get integers", so we're off by 23 from IEEE notation) */ + + outExp = (outExp - 127) - 23; + + /* Finally, shift all the matrix elements to our final output + * precision */ + + for (col = 0; col < 3; ++col) { + m3gFloatVecToShort(3, mtx->elem + MELEM(0, col), outExp, elem); + elem += 3; + } + return outExp; +} + +/*! + * \brief Gets the translation component of a matrix as fixed point + * + * \param mtx the matrix + * \param elem 3 shorts to write the vector into + * \return floating point exponent for the values in \c elem + * (number of bits to shift left for actual values) + */ +static M3Gint m3gGetFixedPointTranslation(const Matrix *mtx, M3Gshort *elem) +{ + const M3Guint *m; + + M3G_ASSERT(m3gIsWUnity(mtx)); + if (!mtx->complete) { + m3gFillClassifiedMatrix((Matrix*) mtx); + } + m = (const M3Guint*) &mtx->elem[MELEM(0, 3)]; + + /* Find the maximum exponent, then scale down by 9 bits from that + * to shift the unsigned 24-bit mantissas to the signed 16-bit + * range */ + { + M3Gint outExp; + M3Guint maxElem = m[0] & ~SIGN_MASK; + maxElem = M3G_MAX(maxElem, m[1] & ~SIGN_MASK); + maxElem = M3G_MAX(maxElem, m[2] & ~SIGN_MASK); + + outExp = (M3Gint)(maxElem >> 23) - (127 + 23) + 9; + m3gFloatVecToShort(3, mtx->elem + MELEM(0, 3), outExp, elem); + return outExp; + } +} + +/*! + * \internal + * \brief Compute a bounding box enclosing two other boxes + * + * \param box box to fit + * \param a first box to enclose or NULL + * \param b second box to enclose or NULL + * + * \note If both input boxes are NULL, the box is not modified. + */ +static void m3gFitAABB(AABB *box, const AABB *a, const AABB *b) +{ + int i; + + M3G_ASSERT_PTR(box); + + if (a) { + m3gValidateAABB(a); + } + if (b) { + m3gValidateAABB(b); + } + + if (a && b) { + for (i = 0; i < 3; ++i) { + box->min[i] = M3G_MIN(a->min[i], b->min[i]); + box->max[i] = M3G_MAX(a->max[i], b->max[i]); + } + } + else if (a) { + *box = *a; + } + else if (b) { + *box = *b; + } +} + +/* + * \internal + * \brief Transform an axis-aligned bounding box with a matrix + * + * This results in a box that encloses the transformed original box. + * + * Based on "Transforming Axis-Aligned Bounding Boxes" by Jim Arvo + * from Graphics Gems. + * + * \note The bottom row of the matrix is ignored in the transformation. + */ +static void m3gTransformAABB(AABB *box, const Matrix *mtx) +{ + M3Gfloat boxMin[3], boxMax[3]; + M3Gfloat newMin[3], newMax[3]; + + m3gValidateAABB(box); + + if (!mtx->complete) { + m3gFillClassifiedMatrix((Matrix*) mtx); + } + + /* Get the original minimum and maximum extents as floats, and add + * the translation as the base for the transformed box */ + { + int i; + for (i = 0; i < 3; ++i) { + boxMin[i] = box->min[i]; + boxMax[i] = box->max[i]; + newMin[i] = newMax[i] = M44F(mtx, i, 3); + } + } + + /* Transform into the new minimum and maximum coordinates using + * the upper left 3x3 part of the matrix */ + { + M3Gint row, col; + + for (row = 0; row < 3; ++row) { + for (col = 0; col < 3; ++col) { + M3Gfloat a = m3gMul(M44F(mtx, row, col), boxMin[col]); + M3Gfloat b = m3gMul(M44F(mtx, row, col), boxMax[col]); + + if (a < b) { + newMin[row] = m3gAdd(newMin[row], a); + newMax[row] = m3gAdd(newMax[row], b); + } + else { + newMin[row] = m3gAdd(newMin[row], b); + newMax[row] = m3gAdd(newMax[row], a); + } + } + } + } + + /* Write back into the bounding box */ + { + int i; + for (i = 0; i < 3; ++i) { + box->min[i] = newMin[i]; + box->max[i] = newMax[i]; + } + } + + m3gValidateAABB(box); +} + +#if defined(M3G_DEBUG) +/*! + * \brief + */ +static void m3gValidateAABB(const AABB *aabb) +{ + m3gValidateFloats(6, (float*) aabb); +} +#endif + +/*---------------------------------------------------------------------- + * Public functions + *--------------------------------------------------------------------*/ + +/*! + * \brief Linear interpolation of vectors + * + * \param size number of components + * \param vec output vector + * \param s interpolation factor + * \param start initial value + * \param end target value + */ +#if defined(M3G_HW_FLOAT_VFPV2) + +__weak __asm void m3gLerp(M3Gint size, + M3Gfloat *vec, + M3Gfloat s, + const M3Gfloat *start, const M3Gfloat *end) +{ +// r0 = size +// r1 = *vec +// r2 = s +// r3 = *start +// sp[0] = *end + + EXPORT m3gLerp[DYNAMIC] + CODE32 +/* + M3Gfloat sCompl = 1.0 - s; + for (i = 0; i < size; ++i) { + vec[i] = sCompl*start[i] + s*end[i]; + } +*/ +// +// if size = 0, return +// + CMP r0, #0 + BXEQ lr + + FMSR s0, r2 + MOV r2, r3 + LDR r3, [sp] + + FLDS s1,=1.0 + STMFD sp!, {r4-r5} + FSUBS s2, s1, s0 // sCompl = 1 - s + + FMRX r4, FPSCR + CMP r0, #4 + BGT _m3gLerp_over4Loop + +// +// if 1 < size <= 4 +// +// set vector STRIDE = 1, LENGTH = 4 + BIC r12, r4, #0x00370000 + ORR r12, #(3<<16) + FMXR FPSCR, r12 + + FLDMIAS r2!, {s4-s7} // load the start[i] values + FLDMIAS r3!, {s8-s11} // load the end[i] values + FMULS s12, s4, s2 // [s12-s15] = [sCompl*start[0] .. sCompl*start[3]] + FMACS s12, s8, s0 // [s12-s15] += [ s*end[0] .. s*end[3]] + CMP r0, #1 + FSTS s12, [r1] + FSTSGT s13, [r1, #4] + CMP r0, #3 + FSTSGE s14, [r1, #8] + FSTSGT s15, [r1, #12] + + FMXR FPSCR, r4 + + LDMFD sp!, {r4-r5} + + BX lr +// +// if size > 4, interpolate 8 values in one loop +// +_m3gLerp_over4Loop + + FSTMFDD sp!, {d8-d9} + MOVS r5, r0, ASR #3 // size/8 + SUB r0, r0, r5, LSL #3 // tail length + +// set vector STRIDE = 1, LENGTH = 8 + BIC r12, r4, #0x00370000 + ORR r12, #(7<<16) + FMXR FPSCR, r12 + + +_m3gLerp_alignedLoop + + FLDMIASNE r2!, {s8-s15} // load the start[i] values + FLDMIASNE r3!, {s16-s23} // load the end[i] values + FMULSNE s24, s8, s2 // [s16-s23] = [ sCompl*start[0] sCompl*start[1] .. sCompl*start[7]] + FMACSNE s24, s16, s0 // [s16-s23] += [ s*end[0] s*end[1] .. s*end[7]] + FSTMIASNE r1!, {s24-s31} + SUBSNE r5, #1 + + BNE _m3gLerp_alignedLoop + +// process the 0-7 remaining values in the tail + + CMP r0, #1 + FLDMIASGE r2!, {s8-s15} + FLDMIASGE r3!, {s16-s23} + FMULSGE s24, s8, s2 + FMACSGE s24, s16, s0 + FSTSGE s24, [r1] + FSTSGT s25, [r1, #4] + CMP r0, #3 + FSTSGE s26, [r1, #8] + FSTSGT s27, [r1, #12] + CMP r0, #5 + FSTSGE s28, [r1, #16] + FSTSGT s29, [r1, #20] + CMP r0, #7 + FSTSEQ s30, [r1, #24] + + FMXR FPSCR, r4 + + FLDMFDD sp!, {d8-d9} + LDMFD sp!, {r4-r5} + + BX lr + +} +#else /* #if defined(M3G_HW_FLOAT_VFPV2) */ + +M3G_API void m3gLerp(M3Gint size, + M3Gfloat *vec, + M3Gfloat s, + const M3Gfloat *start, const M3Gfloat *end) +{ + int i; + + M3Gfloat sCompl = m3gSub(1.f, s); + for (i = 0; i < size; ++i) { + vec[i] = m3gAdd(m3gMul(sCompl, start[i]), m3gMul(s, end[i])); + } +} +#endif /* #if defined(M3G_HW_FLOAT_VFPV2) */ + +/*! + * \brief Hermite spline interpolation of vectors + * + * \param size number of components + * \param vec output vector + * \param s interpolation factor + * \param start start value vector + * \param end end value vector + * \param tStart start tangent vector + * \param tEnd end tangent vector + */ +M3G_API void m3gHermite(M3Gint size, + M3Gfloat *vec, + M3Gfloat s, + const M3Gfloat *start, const M3Gfloat *end, + const M3Gfloat *tStart, const M3Gfloat *tEnd) +{ + M3Gfloat s2 = m3gSquare(s); + M3Gfloat s3 = m3gMul(s2, s); + int i; + + for (i = 0; i < size; ++i) { + vec[i] = + m3gMadd(start[i], + m3gAdd(m3gSub(m3gDouble(s3), m3gMul(3.f, s2)), 1.f), + m3gMadd(end[i], + m3gSub(m3gMul(3.f, s2), m3gDouble(s3)), + m3gMadd(tStart[i], + m3gAdd(m3gSub(s3, m3gDouble(s2)), s), + m3gMul(tEnd[i], + m3gSub(s3, s2))))); + + } + + /* vec = ( 2*s3 - 3*s2 + 1) * start + + (-2*s3 + 3*s2 ) * end + + ( s3 - 2*s2 + s) * tStart + + ( s3 - s2 ) * tEnd; */ +} + +/*--------------------------------------------------------------------*/ + +/*! + * \brief Sets a matrix to a copy of another matrix + */ +M3G_API void m3gCopyMatrix(Matrix *dst, const Matrix *src) +{ + M3G_ASSERT(dst != NULL && src != NULL); + *dst = *src; +} + +/*! + * \brief Vector addition + */ +M3G_API void m3gAddVec3(Vec3 *vec, const Vec3 *other) +{ + vec->x = m3gAdd(vec->x, other->x); + vec->y = m3gAdd(vec->y, other->y); + vec->z = m3gAdd(vec->z, other->z); +} + +/*! + * \brief Vector addition + */ +M3G_API void m3gAddVec4(Vec4 *vec, const Vec4 *other) +{ + vec->x = m3gAdd(vec->x, other->x); + vec->y = m3gAdd(vec->y, other->y); + vec->z = m3gAdd(vec->z, other->z); + vec->w = m3gAdd(vec->w, other->w); +} + +/*! + * \brief Cross product of two 3D vectors expressed as 4D vectors + */ +M3G_API void m3gCross(Vec3 *dst, const Vec3 *a, const Vec3 *b) +{ + dst->x = m3gSub(m3gMul(a->y, b->z), m3gMul(a->z, b->y)); + dst->y = m3gSub(m3gMul(a->z, b->x), m3gMul(a->x, b->z)); + dst->z = m3gSub(m3gMul(a->x, b->y), m3gMul(a->y, b->x)); +} + +/*! + * \brief Dot product of two vectors + */ +M3G_API M3Gfloat m3gDot3(const Vec3 *a, const Vec3 *b) +{ + M3Gfloat d; + d = m3gMul(a->x, b->x); + d = m3gMadd(a->y, b->y, d); + d = m3gMadd(a->z, b->z, d); + return d; +} + +/*! + * \brief Dot product of two vectors + */ +M3G_API M3Gfloat m3gDot4(const Vec4 *a, const Vec4 *b) +{ + M3Gfloat d; + d = m3gMul(a->x, b->x); + d = m3gMadd(a->y, b->y, d); + d = m3gMadd(a->z, b->z, d); + d = m3gMadd(a->w, b->w, d); + return d; +} + +/*! + * \brief + */ +M3G_API void m3gSetVec3(Vec3 *v, M3Gfloat x, M3Gfloat y, M3Gfloat z) +{ + M3G_ASSERT_PTR(v); + v->x = x; + v->y = y; + v->z = z; +} + +/*! + * \brief + */ +M3G_API void m3gSetVec4(Vec4 *v, M3Gfloat x, M3Gfloat y, M3Gfloat z, M3Gfloat w) +{ + M3G_ASSERT_PTR(v); + v->x = x; + v->y = y; + v->z = z; + v->w = w; +} + +/*! + * \brief Vector subtraction + */ +M3G_API void m3gSubVec3(Vec3 *vec, const Vec3 *other) +{ + vec->x = m3gSub(vec->x, other->x); + vec->y = m3gSub(vec->y, other->y); + vec->z = m3gSub(vec->z, other->z); +} + +/*! + * \brief Vector subtraction + */ +M3G_API void m3gSubVec4(Vec4 *vec, const Vec4 *other) +{ + vec->x = m3gSub(vec->x, other->x); + vec->y = m3gSub(vec->y, other->y); + vec->z = m3gSub(vec->z, other->z); + vec->w = m3gSub(vec->w, other->w); +} + +/*! + * \brief Vector length + */ +M3G_API M3Gfloat m3gLengthVec3(const Vec3 *vec) +{ + return m3gSqrt(m3gAdd(m3gAdd(m3gSquare(vec->x), + m3gSquare(vec->y)), + m3gSquare(vec->z))); +} + +/*! + * \brief Vector scaling + */ +M3G_API void m3gScaleVec3(Vec3 *vec, const M3Gfloat s) +{ + vec->x = m3gMul(vec->x, s); + vec->y = m3gMul(vec->y, s); + vec->z = m3gMul(vec->z, s); +} + +/*! + * \brief Vector scaling + */ +M3G_API void m3gScaleVec4(Vec4 *vec, const M3Gfloat s) +{ + vec->x = m3gMul(vec->x, s); + vec->y = m3gMul(vec->y, s); + vec->z = m3gMul(vec->z, s); + vec->w = m3gMul(vec->w, s); +} + +/*! + * \brief Returns an angle-axis representation for a quaternion + * + * \note There are many, and this is not guaranteed to return a + * particular one + */ +M3G_API void m3gGetAngleAxis(const Quat *quat, M3Gfloat *angle, Vec3 *axis) +{ + M3Gfloat x, y, z, sinTheta; + + M3G_ASSERT_PTR(quat); + M3G_ASSERT_PTR(angle); + M3G_ASSERT_PTR(axis); + + x = quat->x; + y = quat->y; + z = quat->z; + + sinTheta = m3gSqrt(m3gAdd(m3gAdd(m3gSquare(x), m3gSquare(y)), + m3gSquare(z))); + + if (sinTheta > EPSILON) { + M3Gfloat ooSinTheta = m3gRcp(sinTheta); + axis->x = m3gMul(x, ooSinTheta); + axis->y = m3gMul(y, ooSinTheta); + axis->z = m3gMul(z, ooSinTheta); + } + else { + /* return a valid axis even for no rotation */ + axis->x = axis->y = 0.0f; + axis->z = 1.0f; + } + *angle = m3gMul(2.0f * RAD2DEG, m3gArcCos(quat->w)); +} + +/*! + * \brief Gets a single matrix column + */ +M3G_API void m3gGetMatrixColumn(const Matrix *mtx, M3Gint col, Vec4 *dst) +{ + if ((col & ~3) == 0) { + if (!mtx->complete) { + m3gFillClassifiedMatrix((Matrix*)mtx); + } + dst->x = M44F(mtx, 0, col); + dst->y = M44F(mtx, 1, col); + dst->z = M44F(mtx, 2, col); + dst->w = M44F(mtx, 3, col); + } + else { + M3G_ASSERT(M3G_FALSE); + } +} + +/*! + * \brief Returns the floating point values of a matrix as consecutive + * columns + */ +M3G_API void m3gGetMatrixColumns(const Matrix *mtx, M3Gfloat *dst) +{ + M3G_ASSERT(mtx != NULL && dst != NULL); + + if (!mtx->complete) { + m3gFillClassifiedMatrix((Matrix*)mtx); + } + m3gCopy(dst, mtx->elem, sizeof(mtx->elem)); +} + +/*! + * \brief Gets a single matrix row + */ +M3G_API void m3gGetMatrixRow(const Matrix *mtx, M3Gint row, Vec4 *dst) +{ + if ((row & ~3) == 0) { + if (!mtx->complete) { + m3gFillClassifiedMatrix((Matrix*)mtx); + } + dst->x = M44F(mtx, row, 0); + dst->y = M44F(mtx, row, 1); + dst->z = M44F(mtx, row, 2); + dst->w = M44F(mtx, row, 3); + } + else { + M3G_ASSERT(M3G_FALSE); + } +} + +/*! + * \brief Returns the floating point values of a matrix as consecutive + * rows + */ +M3G_API void m3gGetMatrixRows(const Matrix *mtx, M3Gfloat *dst) +{ + M3G_ASSERT(mtx != NULL && dst != NULL); + + if (!mtx->complete) { + m3gFillClassifiedMatrix((Matrix*)mtx); + } + { + int row; + for (row = 0; row < 4; ++row) { + *dst++ = mtx->elem[ 0 + row]; + *dst++ = mtx->elem[ 4 + row]; + *dst++ = mtx->elem[ 8 + row]; + *dst++ = mtx->elem[12 + row]; + } + } +} + +/*! + * \brief Sets a matrix to identity + */ +M3G_API void m3gIdentityMatrix(Matrix *mtx) +{ + M3G_ASSERT(mtx != NULL); + m3gClassifyAs(MC_IDENTITY, mtx); +} + +/*! + * \brief Sets a quaternion to identity + */ +M3G_API void m3gIdentityQuat(Quat *quat) +{ + M3G_ASSERT(quat != NULL); + quat->x = quat->y = quat->z = 0.0f; + quat->w = 1.0f; +} + +/*! + * \brief Inverts a matrix + */ +M3G_API M3Gbool m3gInvertMatrix(Matrix *mtx) +{ + M3Gfloat *matrix; + M3Gint i; + M3Gfloat tmp[12]; + M3Gfloat src[16]; + M3Gfloat det; + + M3G_ASSERT(mtx != NULL); + + if (!m3gIsClassified(mtx)) { + m3gClassify(mtx); + } + + /* Quick exit for identity */ + + if (mtx->mask == MC_IDENTITY) { + return M3G_TRUE; + } + + /* Look for other common cases; these require that we have valid + * values in all the elements, so fill in the values first */ + { + M3Guint mask = mtx->mask; + + if (!mtx->complete) { + m3gFillClassifiedMatrix(mtx); + } + + if ((mask | (0x3F << 24)) == MC_TRANSLATION) { + M44F(mtx, 0, 3) = m3gNegate(M44F(mtx, 0, 3)); + M44F(mtx, 1, 3) = m3gNegate(M44F(mtx, 1, 3)); + M44F(mtx, 2, 3) = m3gNegate(M44F(mtx, 2, 3)); + mtx->mask = MC_TRANSLATION; + return M3G_TRUE; + } + if ((mask | 0x300C03) == MC_SCALING) { + if ((mask & 3 ) == 0 || + (mask & (3 << 10)) == 0 || + (mask & (3 << 20)) == 0) { + return M3G_FALSE; /* zero scale for at least one axis */ + } + M44F(mtx, 0, 0) = m3gRcp(M44F(mtx, 0, 0)); + M44F(mtx, 1, 1) = m3gRcp(M44F(mtx, 1, 1)); + M44F(mtx, 2, 2) = m3gRcp(M44F(mtx, 2, 2)); + return M3G_TRUE; + } + } + + /* Do a full 4x4 inversion as a last resort */ + + matrix = mtx->elem; + + /* transpose matrix */ + for (i = 0; i < 4; i++) { + src[i] = matrix[i*4]; + src[i+4] = matrix[i*4+1]; + src[i+8] = matrix[i*4+2]; + src[i+12] = matrix[i*4+3]; + } + + /* calculate pairs for first 8 elements (cofactors) */ + tmp[0] = src[10]*src[15]; + tmp[1] = src[11]*src[14]; + tmp[2] = src[9]*src[15]; + tmp[3] = src[11]*src[13]; + tmp[4] = src[9]*src[14]; + tmp[5] = src[10]*src[13]; + tmp[6] = src[8]*src[15]; + tmp[7] = src[11]*src[12]; + tmp[8] = src[8]*src[14]; + tmp[9] = src[10]*src[12]; + tmp[10] = src[8]*src[13]; + tmp[11] = src[9]*src[12]; + + /* calculate first 8 elements (cofactors) */ + matrix[0] = tmp[0]*src[5] + tmp[3]*src[6] + tmp[4]*src[7]; + matrix[0] -= tmp[1]*src[5] + tmp[2]*src[6] + tmp[5]*src[7]; + matrix[1] = tmp[1]*src[4] + tmp[6]*src[6] + tmp[9]*src[7]; + matrix[1] -= tmp[0]*src[4] + tmp[7]*src[6] + tmp[8]*src[7]; + matrix[2] = tmp[2]*src[4] + tmp[7]*src[5] + tmp[10]*src[7]; + matrix[2] -= tmp[3]*src[4] + tmp[6]*src[5] + tmp[11]*src[7]; + matrix[3] = tmp[5]*src[4] + tmp[8]*src[5] + tmp[11]*src[6]; + matrix[3] -= tmp[4]*src[4] + tmp[9]*src[5] + tmp[10]*src[6]; + matrix[4] = tmp[1]*src[1] + tmp[2]*src[2] + tmp[5]*src[3]; + matrix[4] -= tmp[0]*src[1] + tmp[3]*src[2] + tmp[4]*src[3]; + matrix[5] = tmp[0]*src[0] + tmp[7]*src[2] + tmp[8]*src[3]; + matrix[5] -= tmp[1]*src[0] + tmp[6]*src[2] + tmp[9]*src[3]; + matrix[6] = tmp[3]*src[0] + tmp[6]*src[1] + tmp[11]*src[3]; + matrix[6] -= tmp[2]*src[0] + tmp[7]*src[1] + tmp[10]*src[3]; + matrix[7] = tmp[4]*src[0] + tmp[9]*src[1] + tmp[10]*src[2]; + matrix[7] -= tmp[5]*src[0] + tmp[8]*src[1] + tmp[11]*src[2]; + + /* calculate pairs for second 8 elements (cofactors) */ + tmp[0] = src[2]*src[7]; + tmp[1] = src[3]*src[6]; + tmp[2] = src[1]*src[7]; + tmp[3] = src[3]*src[5]; + tmp[4] = src[1]*src[6]; + tmp[5] = src[2]*src[5]; + tmp[6] = src[0]*src[7]; + tmp[7] = src[3]*src[4]; + tmp[8] = src[0]*src[6]; + tmp[9] = src[2]*src[4]; + tmp[10] = src[0]*src[5]; + tmp[11] = src[1]*src[4]; + + /* calculate second 8 elements (cofactors) */ + matrix[8] = tmp[0]*src[13] + tmp[3]*src[14] + tmp[4]*src[15]; + matrix[8] -= tmp[1]*src[13] + tmp[2]*src[14] + tmp[5]*src[15]; + matrix[9] = tmp[1]*src[12] + tmp[6]*src[14] + tmp[9]*src[15]; + matrix[9] -= tmp[0]*src[12] + tmp[7]*src[14] + tmp[8]*src[15]; + matrix[10] = tmp[2]*src[12] + tmp[7]*src[13] + tmp[10]*src[15]; + matrix[10] -= tmp[3]*src[12] + tmp[6]*src[13] + tmp[11]*src[15]; + matrix[11] = tmp[5]*src[12] + tmp[8]*src[13] + tmp[11]*src[14]; + matrix[11] -= tmp[4]*src[12] + tmp[9]*src[13] + tmp[10]*src[14]; + matrix[12] = tmp[2]*src[10] + tmp[5]*src[11] + tmp[1]*src[9]; + matrix[12] -= tmp[4]*src[11] + tmp[0]*src[9] + tmp[3]*src[10]; + matrix[13] = tmp[8]*src[11] + tmp[0]*src[8] + tmp[7]*src[10]; + matrix[13] -= tmp[6]*src[10] + tmp[9]*src[11] + tmp[1]*src[8]; + matrix[14] = tmp[6]*src[9] + tmp[11]*src[11] + tmp[3]*src[8]; + matrix[14] -= tmp[10]*src[11] + tmp[2]*src[8] + tmp[7]*src[9]; + matrix[15] = tmp[10]*src[10] + tmp[4]*src[8] + tmp[9]*src[9]; + matrix[15] -= tmp[8]*src[9] + tmp[11]*src[10] + tmp[5]*src[8]; + + /* calculate determinant */ + det = src[0]*matrix[0]+src[1]*matrix[1]+src[2]*matrix[2]+src[3]*matrix[3]; + + /* matrix has no inverse */ + if (det == 0.0f) { + return M3G_FALSE; + } + + /* calculate matrix inverse */ + det = 1/det; + for (i = 0; i < 16; i++) { + matrix[i] *= det; + } + + mtx->classified = M3G_FALSE; + return M3G_TRUE; +} + +/*! + * \brief Sets a matrix to the inverse of another matrix + */ +M3G_API M3Gbool m3gMatrixInverse(Matrix *mtx, const Matrix *other) +{ + M3G_ASSERT(mtx != NULL && other != NULL); + + if (!m3gIsClassified(other)) { + m3gClassify((Matrix*)other); + } + + m3gCopyMatrix(mtx, other); + return m3gInvertMatrix(mtx); +} + +/*! + * \brief Sets a matrix to the transpose of another matrix + */ +M3G_API void m3gMatrixTranspose(Matrix *mtx, const Matrix *other) +{ + M3Gbyte i; + M3G_ASSERT(mtx != NULL && other != NULL); + + if (!other->complete) { + m3gFillClassifiedMatrix((Matrix *)other); + } + + for (i = 0; i < 4; i++) { + mtx->elem[i] = other->elem[i*4]; + mtx->elem[i+4] = other->elem[i*4+1]; + mtx->elem[i+8] = other->elem[i*4+2]; + mtx->elem[i+12] = other->elem[i*4+3]; + } + mtx->classified = M3G_FALSE; + mtx->complete = M3G_TRUE; +} + +M3G_API M3Gbool m3gInverseTranspose(Matrix *mtx, const Matrix *other) +{ + Matrix temp; + if (!m3gMatrixInverse(&temp, other)) { + return M3G_FALSE; + } + m3gMatrixTranspose(mtx, &temp); + return M3G_TRUE; +} + +/*! + * \brief Sets a matrix to the product of two other matrices + * + * \note \c dst can not be either of \c left or \c right; if it is, + * the results are undefined + */ +M3G_API void m3gMatrixProduct(Matrix *dst, const Matrix *left, const Matrix *right) +{ + M3G_ASSERT_PTR(dst); + M3G_ASSERT_PTR(left); + M3G_ASSERT_PTR(right); + M3G_ASSERT(dst != left && dst != right); + + /* Classify input matrices and take early exits for identities */ + + if (!m3gIsClassified(left)) { + m3gClassify((Matrix*)left); + } + if (left->mask == MC_IDENTITY) { + m3gCopyMatrix(dst, right); + return; + } + + if (!m3gIsClassified(right)) { + m3gClassify((Matrix*)right); + } + if (right->mask == MC_IDENTITY) { + m3gCopyMatrix(dst, left); + return; + } + + /* Special quick paths for 3x4 matrices */ + + if (m3gIsWUnity(left) && m3gIsWUnity(right)) { + + /* Translation? */ + + if ((left->mask & ~MC_TRANSLATION_PART) == MC_IDENTITY) { + + if (left->mask != MC_TRANSLATION && !left->complete) { + m3gFillClassifiedMatrix((Matrix*)left); + } + if (right->mask != MC_TRANSLATION && !right->complete) { + m3gFillClassifiedMatrix((Matrix*)right); + } + + m3gCopyMatrix(dst, right); + + M44F(dst, 0, 3) = m3gAdd(M44F(left, 0, 3), M44F(dst, 0, 3)); + M44F(dst, 1, 3) = m3gAdd(M44F(left, 1, 3), M44F(dst, 1, 3)); + M44F(dst, 2, 3) = m3gAdd(M44F(left, 2, 3), M44F(dst, 2, 3)); + + dst->mask |= MC_TRANSLATION_PART; + return; + } + + if ((right->mask & ~MC_TRANSLATION_PART) == MC_IDENTITY) { + Vec4 tvec; + + if (left->mask != MC_TRANSLATION && !left->complete) { + m3gFillClassifiedMatrix((Matrix*)left); + } + if (right->mask != MC_TRANSLATION && !right->complete) { + m3gFillClassifiedMatrix((Matrix*)right); + } + + m3gCopyMatrix(dst, left); + + m3gGetMatrixColumn(right, 3, &tvec); + m3gTransformVec4(dst, &tvec); + + M44F(dst, 0, 3) = tvec.x; + M44F(dst, 1, 3) = tvec.y; + M44F(dst, 2, 3) = tvec.z; + + dst->mask |= MC_TRANSLATION_PART; + return; + } + + } + + /* Compute product and set output classification */ + + m3gGenericMatrixProduct(dst, left, right); +} + +/*! + * \brief Normalizes a quaternion + */ +M3G_API void m3gNormalizeQuat(Quat *q) +{ + M3Gfloat norm; + M3G_ASSERT_PTR(q); + + norm = m3gNorm4(&q->x); + + if (norm > EPSILON) { + norm = m3gRcpSqrt(norm); + m3gScale4(&q->x, norm); + } + else { + m3gIdentityQuat(q); + } +} + +/*! + * \brief Normalizes a three-vector + */ +M3G_API void m3gNormalizeVec3(Vec3 *v) +{ + M3Gfloat norm; + M3G_ASSERT_PTR(v); + + norm = m3gNorm3(&v->x); + + if (norm > EPSILON) { + norm = m3gRcpSqrt(norm); + m3gScale3(&v->x, norm); + } + else { + m3gZero(v, sizeof(Vec3)); + } +} + +/*! + * \brief Normalizes a four-vector + */ +M3G_API void m3gNormalizeVec4(Vec4 *v) +{ + M3Gfloat norm; + M3G_ASSERT_PTR(v); + + norm = m3gNorm4(&v->x); + + if (norm > EPSILON) { + norm = m3gRcpSqrt(norm); + m3gScale4(&v->x, norm); + } + else { + m3gZero(v, sizeof(Vec4)); + } +} + +/*! + * \brief Multiplies a matrix from the right with another matrix + */ +M3G_API void m3gPostMultiplyMatrix(Matrix *mtx, const Matrix *other) +{ + Matrix temp; + M3G_ASSERT_PTR(mtx); + M3G_ASSERT_PTR(other); + + m3gCopyMatrix(&temp, mtx); + m3gMatrixProduct(mtx, &temp, other); +} + +/*! + * \brief Multiplies a matrix from the left with another matrix + */ +M3G_API void m3gPreMultiplyMatrix(Matrix *mtx, const Matrix *other) +{ + Matrix temp; + M3G_ASSERT_PTR(mtx); + M3G_ASSERT_PTR(other); + + m3gCopyMatrix(&temp, mtx); + m3gMatrixProduct(mtx, other, &temp); +} + +/*! + * \brief Multiplies a matrix with a rotation matrix. + */ +M3G_API void m3gPostRotateMatrix(Matrix *mtx, + M3Gfloat angle, + M3Gfloat ax, M3Gfloat ay, M3Gfloat az) +{ + Quat q; + m3gSetAngleAxis(&q, angle, ax, ay, az); + m3gPostRotateMatrixQuat(mtx, &q); +} + +/*! + * \brief Multiplies a matrix with a translation matrix. + */ +M3G_API void m3gPostRotateMatrixQuat(Matrix *mtx, const Quat *quat) +{ + Matrix temp; + m3gQuatMatrix(&temp, quat); + m3gPostMultiplyMatrix(mtx, &temp); +} + +/*! + * \brief Multiplies a matrix with a scale matrix. + */ +M3G_API void m3gPostScaleMatrix(Matrix *mtx, M3Gfloat sx, M3Gfloat sy, M3Gfloat sz) +{ + Matrix temp; + m3gScalingMatrix(&temp, sx, sy, sz); + m3gPostMultiplyMatrix(mtx, &temp); +} + +/*! + * \brief Multiplies a matrix with a translation (matrix). + */ +M3G_API void m3gPostTranslateMatrix(Matrix *mtx, + M3Gfloat tx, M3Gfloat ty, M3Gfloat tz) +{ + Matrix temp; + m3gTranslationMatrix(&temp, tx, ty, tz); + m3gPostMultiplyMatrix(mtx, &temp); +} + +/*! + * \brief Multiplies a matrix with a rotation matrix + */ +M3G_API void m3gPreRotateMatrix(Matrix *mtx, + M3Gfloat angle, + M3Gfloat ax, M3Gfloat ay, M3Gfloat az) +{ + Quat q; + m3gSetAngleAxis(&q, angle, ax, ay, az); + m3gPreRotateMatrixQuat(mtx, &q); +} + +/*! + * \brief Multiplies a matrix with a quaternion rotation + */ +M3G_API void m3gPreRotateMatrixQuat(Matrix *mtx, const Quat *quat) +{ + Matrix temp; + m3gQuatMatrix(&temp, quat); + m3gPreMultiplyMatrix(mtx, &temp); +} + +/*! + * \brief Multiplies a matrix with a scale matrix. + */ +M3G_API void m3gPreScaleMatrix(Matrix *mtx, M3Gfloat sx, M3Gfloat sy, M3Gfloat sz) +{ + Matrix temp; + m3gScalingMatrix(&temp, sx, sy, sz); + m3gPreMultiplyMatrix(mtx, &temp); +} + +/*! + * \brief Multiplies a matrix with a translation (matrix). + */ +M3G_API void m3gPreTranslateMatrix(Matrix *mtx, + M3Gfloat tx, M3Gfloat ty, M3Gfloat tz) +{ + Matrix temp; + m3gTranslationMatrix(&temp, tx, ty, tz); + m3gPreMultiplyMatrix(mtx, &temp); +} + +/*! + * \brief Converts a quaternion into a matrix + * + * The output is a matrix effecting the same rotation as the + * quaternion passed as input + */ +M3G_API void m3gQuatMatrix(Matrix *mtx, const Quat *quat) +{ + M3Gfloat qx = quat->x; + M3Gfloat qy = quat->y; + M3Gfloat qz = quat->z; + M3Gfloat qw = quat->w; + + /* Quick exit for identity rotations */ + + if (IS_ZERO(qx) && IS_ZERO(qy) && IS_ZERO(qz)) { + m3gIdentityMatrix(mtx); + return; + } + + { + /* Determine the rough type of the output matrix */ + + M3Guint type = MC_SCALING_ROTATION; + if (IS_ZERO(qz) && IS_ZERO(qy)) { + type = MC_X_ROTATION; + } + else if (IS_ZERO(qz) && IS_ZERO(qx)) { + type = MC_Y_ROTATION; + } + else if (IS_ZERO(qx) && IS_ZERO(qy)) { + type = MC_Z_ROTATION; + } + m3gClassifyAs(type, mtx); + + /* Generate the non-constant parts of the matrix */ + { + M3Gfloat wx, wy, wz, xx, yy, yz, xy, xz, zz; + + xx = m3gMul(qx, qx); + xy = m3gMul(qx, qy); + xz = m3gMul(qx, qz); + yy = m3gMul(qy, qy); + yz = m3gMul(qy, qz); + zz = m3gMul(qz, qz); + wx = m3gMul(qw, qx); + wy = m3gMul(qw, qy); + wz = m3gMul(qw, qz); + + if (type != MC_X_ROTATION) { + M44F(mtx, 0, 0) = m3gSub(1.f, m3gDouble(m3gAdd(yy, zz))); + M44F(mtx, 0, 1) = m3gDouble(m3gSub(xy, wz)); + M44F(mtx, 0, 2) = m3gDouble(m3gAdd(xz, wy)); + } + + if (type != MC_Y_ROTATION) { + M44F(mtx, 1, 0) = m3gDouble(m3gAdd(xy, wz)); + M44F(mtx, 1, 1) = m3gSub(1.f, m3gDouble(m3gAdd(xx, zz))); + M44F(mtx, 1, 2) = m3gDouble(m3gSub(yz, wx)); + } + + if (type != MC_Z_ROTATION) { + M44F(mtx, 2, 0) = m3gDouble(m3gSub(xz, wy)); + M44F(mtx, 2, 1) = m3gDouble(m3gAdd(yz, wx)); + M44F(mtx, 2, 2) = m3gSub(1.f, m3gDouble(m3gAdd(xx, yy))); + } + } + m3gSubClassify(mtx); + } +} + +/*! + * \brief Generates a scaling matrix + */ +M3G_API void m3gScalingMatrix( + Matrix *mtx, + const M3Gfloat sx, const M3Gfloat sy, const M3Gfloat sz) +{ + M3G_ASSERT_PTR(mtx); + M44F(mtx, 0, 0) = sx; + M44F(mtx, 1, 1) = sy; + M44F(mtx, 2, 2) = sz; + m3gClassifyAs(MC_SCALING, mtx); + m3gSubClassify(mtx); +} + +/*! + * \brief Sets a quaternion to represent an angle-axis rotation + */ +M3G_API void m3gSetAngleAxis(Quat *quat, + M3Gfloat angle, + M3Gfloat ax, M3Gfloat ay, M3Gfloat az) +{ + m3gSetAngleAxisRad(quat, m3gMul(angle, M3G_DEG2RAD), ax, ay, az); +} + +/*! + * \brief Sets a quaternion to represent an angle-axis rotation + */ +M3G_API void m3gSetAngleAxisRad(Quat *quat, + M3Gfloat angleRad, + M3Gfloat ax, M3Gfloat ay, M3Gfloat az) +{ + M3G_ASSERT_PTR(quat); + + if (!IS_ZERO(angleRad)) { + M3Gfloat s; + M3Gfloat halfAngle = m3gHalf(angleRad); + + s = m3gSin(halfAngle); + + { + M3Gfloat sqrNorm = m3gMadd(ax, ax, m3gMadd(ay, ay, m3gMul(az, az))); + if (sqrNorm < 0.995f || sqrNorm > 1.005f) { + if (sqrNorm > EPSILON) { + M3Gfloat ooNorm = m3gRcpSqrt(sqrNorm); + ax = m3gMul(ax, ooNorm); + ay = m3gMul(ay, ooNorm); + az = m3gMul(az, ooNorm); + } + else { + ax = ay = az = 0.0f; + } + } + } + + quat->x = m3gMul(s, ax); + quat->y = m3gMul(s, ay); + quat->z = m3gMul(s, az); + quat->w = m3gCos(halfAngle); + } + else { + m3gIdentityQuat(quat); + } +} + +/*! + * \brief Quaternion multiplication. + */ +M3G_API void m3gMulQuat(Quat *quat, const Quat *other) +{ + Quat q; + q = *quat; + + quat->w = m3gMul(q.w, other->w) - m3gMul(q.x, other->x) - m3gMul(q.y, other->y) - m3gMul(q.z, other->z); + quat->x = m3gMul(q.w, other->x) + m3gMul(q.x, other->w) + m3gMul(q.y, other->z) - m3gMul(q.z, other->y); + quat->y = m3gMul(q.w, other->y) - m3gMul(q.x, other->z) + m3gMul(q.y, other->w) + m3gMul(q.z, other->x); + quat->z = m3gMul(q.w, other->z) + m3gMul(q.x, other->y) - m3gMul(q.y, other->x) + m3gMul(q.z, other->w); +} + +/*! + * \brief Makes this quaternion represent the rotation from one 3D + * vector to another + */ +M3G_API void m3gSetQuatRotation(Quat *quat, + const Vec3 *from, const Vec3 *to) +{ + M3Gfloat cosAngle; + + M3G_ASSERT_PTR(quat); + M3G_ASSERT_PTR(from); + M3G_ASSERT_PTR(to); + + cosAngle = m3gDot3(from, to); + + if (cosAngle > (1.0f - EPSILON)) { /* zero angle */ + m3gIdentityQuat(quat); + return; + } + else if (cosAngle > (1.0e-3f - 1.0f)) { /* normal case */ + Vec3 axis; + m3gCross(&axis, from, to); + m3gSetAngleAxisRad(quat, m3gArcCos(cosAngle), axis.x, axis.y, axis.z); + } + else { + + /* Opposite vectors; must generate an arbitrary perpendicular + * vector and use that as the rotation axis. Here, we try the + * Z axis first, and if that seems too parallel to the + * vectors, project the Y axis instead: Z is the only good + * choice for Z-constrained rotations, and Y by definition + * must be perpendicular to that. */ + + Vec3 axis, temp; + M3Gfloat s; + + axis.x = axis.y = axis.z = 0.0f; + if (m3gAbs(from->z) < (1.0f - EPSILON)) { + axis.z = 1.0f; + } + else { + axis.y = 1.0f; + } + + s = m3gDot3(&axis, from); + temp = *from; + m3gScaleVec3(&temp, s); + m3gSubVec3(&axis, &temp); + + m3gSetAngleAxis(quat, 180.f, axis.x, axis.y, axis.z); + } +} + +/*! + * \brief Sets the values of a matrix + * + */ +M3G_API void m3gSetMatrixColumns(Matrix *mtx, const M3Gfloat *src) +{ + M3G_ASSERT(mtx != NULL && src != NULL); + + m3gCopy(mtx->elem, src, sizeof(mtx->elem)); + mtx->classified = M3G_FALSE; + mtx->complete = M3G_TRUE; +} + +/*! + * \brief Sets the values of a matrix + * + */ +M3G_API void m3gSetMatrixRows(Matrix *mtx, const M3Gfloat *src) +{ + M3G_ASSERT(mtx != NULL && src != NULL); + { + int row; + for (row = 0; row < 4; ++row) { + mtx->elem[ 0 + row] = *src++; + mtx->elem[ 4 + row] = *src++; + mtx->elem[ 8 + row] = *src++; + mtx->elem[12 + row] = *src++; + } + } + mtx->classified = M3G_FALSE; + mtx->complete = M3G_TRUE; +} + +/*! + * \brief Transforms a 4-vector with a matrix + */ +#if defined(M3G_HW_FLOAT_VFPV2) + +__asm void _m3gTransformVec4(const Matrix *mtx, Vec4 *vec, M3Gint n) +{ + + CODE32 + + FSTMFDD sp!, {d8-d11} + + FMRX r3, FPSCR + BIC r12, r3, #0x00370000 + ORR r12, #(3<<16) + FMXR FPSCR, r12 + CMP r2, #4 + + FLDMIAS r0, {s4-s19} // [mtx0 mtx1 ..] + FLDMIAS r1, {s0-s3} // [vec.x vec.y vec.z vec.w] + FMULS s20, s4, s0 // [s20-s23] = [v.x*mtx0 v.x*mtx1 v.x*mtx2 v.x*mtx3 ] + FMACS s20, s8, s1 // [s20-s23] += [v.y*mtx4 v.y*mtx5 v.y*mtx6 v.y*mtx7 ] + FMACS s20, s12, s2 // [s20-s23] += [v.z*mtx8 v.z*mtx9 v.z*mtx10 v.z*mtx11] + FMACS s20, s16, s3 // [s20-s23] += [v.w*mtx12 v.w*mtx13 v.w*mtx14 v.w*mtx15] + FSTMIAS r1!, {s20-s22} + FSTMIASEQ r1, {s23} + + FMXR FPSCR, r3 + FLDMFDD sp!, {d8-d11} + + BX lr +} +#endif /* #if defined(M3G_HW_FLOAT_VFPV2) */ + +M3G_API void m3gTransformVec4(const Matrix *mtx, Vec4 *vec) +{ + M3Guint type; + M3G_ASSERT(mtx != NULL && vec != NULL); + + if (!m3gIsClassified(mtx)) { + m3gClassify((Matrix*)mtx); + } + + type = mtx->mask; + + if (type == MC_IDENTITY) { + return; + } + else { + int n = m3gIsWUnity(mtx) ? 3 : 4; + + if (!mtx->complete) { + m3gFillClassifiedMatrix((Matrix*)mtx); + } +#if defined(M3G_HW_FLOAT_VFPV2) + _m3gTransformVec4(mtx, vec, n); +#else + { + Vec4 v = *vec; + int i; + + for (i = 0; i < n; ++i) { + M3Gfloat d = m3gMul(M44F(mtx, i, 0), v.x); + d = m3gMadd(M44F(mtx, i, 1), v.y, d); + d = m3gMadd(M44F(mtx, i, 2), v.z, d); + d = m3gMadd(M44F(mtx, i, 3), v.w, d); + (&vec->x)[i] = d; + } + } +#endif + } +} + +/*! + * \brief Generates a translation matrix + */ +M3G_API void m3gTranslationMatrix( + Matrix *mtx, + const M3Gfloat tx, const M3Gfloat ty, const M3Gfloat tz) +{ + M3G_ASSERT_PTR(mtx); + M44F(mtx, 0, 3) = tx; + M44F(mtx, 1, 3) = ty; + M44F(mtx, 2, 3) = tz; + m3gClassifyAs(MC_TRANSLATION, mtx); + m3gSubClassify(mtx); +} + +/*! + * \brief Float vector assignment. + */ +M3G_API void m3gSetQuat(Quat *quat, const M3Gfloat *vec) +{ + quat->x = vec[0]; + quat->y = vec[1]; + quat->z = vec[2]; + quat->w = vec[3]; +} + +/*! + * \brief Slerp between quaternions q0 and q1, storing the result in quat. + */ +M3G_API void m3gSlerpQuat(Quat *quat, + M3Gfloat s, + const Quat *q0, const Quat *q1) +{ + M3Gfloat s0, s1; + M3Gfloat cosTheta = m3gDot4((const Vec4 *)q0, (const Vec4 *)q1); + M3Gfloat oneMinusS = m3gSub(1.0f, s); + + if (cosTheta > EPSILON - 1.0f) { + if (cosTheta < 1.0f - EPSILON) { + M3Gfloat theta = m3gArcCos(cosTheta); + M3Gfloat sinTheta = m3gSin(theta); + s0 = m3gSin(m3gMul(oneMinusS, theta)) / sinTheta; + s1 = m3gSin(m3gMul(s, theta)) / sinTheta; + } + else { + /* For quaternions very close to each other, use plain + linear interpolation for numerical stability. */ + s0 = oneMinusS; + s1 = s; + } + quat->x = m3gMadd(s0, q0->x, m3gMul(s1, q1->x)); + quat->y = m3gMadd(s0, q0->y, m3gMul(s1, q1->y)); + quat->z = m3gMadd(s0, q0->z, m3gMul(s1, q1->z)); + quat->w = m3gMadd(s0, q0->w, m3gMul(s1, q1->w)); + } + else { + /* Slerp is undefined if the two quaternions are (nearly) + opposite, so we just pick an arbitrary plane of + rotation here. */ + + quat->x = -q0->y; + quat->y = q0->x; + quat->z = -q0->w; + quat->w = q0->z; + + s0 = m3gSin(m3gMul(oneMinusS, HALF_PI)); + s1 = m3gSin(m3gMul(s, HALF_PI)); + + quat->x = m3gMadd(s0, q0->x, m3gMul(s1, quat->x)); + quat->y = m3gMadd(s0, q0->y, m3gMul(s1, quat->y)); + quat->z = m3gMadd(s0, q0->z, m3gMul(s1, quat->z)); + } +} + +/*! + * \brief Interpolate quaternions using spline, or "squad" interpolation. + */ +M3G_API void m3gSquadQuat(Quat *quat, + M3Gfloat s, + const Quat *q0, const Quat *a, const Quat *b, const Quat *q1) +{ + + Quat temp0; + Quat temp1; + m3gSlerpQuat(&temp0, s, q0, q1); + m3gSlerpQuat(&temp1, s, a, b); + + m3gSlerpQuat(quat, m3gDouble(m3gMul(s, m3gSub(1.0f, s))), &temp0, &temp1); +} + + +