--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/os/ossrv/genericopenlibs/liboil/src/arm/math_vfp_asm.s	Fri Jun 15 03:10:57 2012 +0200
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2007
+ * Josep Torra <josep@fluendo.com>. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if __VFP_FP__
+/*
+** compile with -mcpu=arm1136j-s -mfpu=vfp -mfloat-abi=softfp
+**
+** void vfp_add_f32 (float *d, const float *s1, const float *s2, int n);
+** void vfp_add_f64 (double *d, const double *s1, const double *s2, int n);
+** void vfp_divide_f32 (float *d, const float *s1, const float *s2, int n);
+** void vfp_divide_f64 (double *d, const double *s1, const double *s2, int n);
+** void vfp_multiply_f32 (float *d, const float *s1, const float *s2, int n);
+** void vfp_multiply_f64 (double *d, const double *s1, const double *s2, int n);
+** void vfp_subtract_f32 (float *d, const float *s1, const float *s2, int n);
+** void vfp_subtract_f64 (double *d, const double *s1, const double *s2, int n);
+**
+** d: $r0 | s1: $r1 | s2: $r2 | n: $r3 |
+**
+*/
+
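+/*
+** Usage sketch (editorial addition, not part of the original sources): a
+** hypothetical C caller, assuming the prototypes above are declared to the
+** compiler.  n need not be a multiple of the unroll factor; the leading
+** loop in each routine handles the remainder before vector mode is enabled.
+**
+**   float a[100], b[100], prod[100];
+**   // ... fill a and b ...
+**   vfp_multiply_f32 (prod, a, b, 100);   // prod[i] = a[i] * b[i]
+*/
+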
+#define UNROLL_F32_TEMPLATE(fname,finst) \
+  .global vfp_ ## fname ## ; \
+  vfp_ ## fname ## : \
+      stmdb sp!, {fp, lr};     /* save registers to stack */ \
+      ands ip, r3, #7;         /* ip = n % 8 */ \
+      beq vfp_ ## fname ## _unroll; /* if ip == 0 goto unrolled loop */ \
+  vfp_ ## fname ## _loop1: \
+      fldmias r1!, {s0}; \
+      fldmias r2!, {s1}; \
+      ## finst ##s s2, s0, s1; \
+      fstmias r0!, {s2}; \
+      subs ip, ip, #1; \
+      bne vfp_ ## fname ## _loop1; \
+  vfp_ ## fname ## _unroll:    /* unroll by 8 */ \
+      movs ip, r3, lsr #3;     /* ip = n / 8 */ \
+      beq vfp_ ## fname ## _end; /* if ip == 0 goto end */ \
+      fmrx lr, fpscr;          /* read fpscr into an ARM register */ \
+      mov fp, #7; \
+      orr fp, lr, fp, lsl #16; /* set vector length to 8 (FPSCR LEN = 7) */ \
+      fmxr fpscr, fp; \
+  vfp_ ## fname ## _loop2: \
+      fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}; \
+      fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}; \
+      ## finst ##s s24, s8, s16; \
+      fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}; \
+      subs ip, ip, #1; \
+      bne vfp_ ## fname ## _loop2; \
+      fmxr fpscr, lr;          /* restore original fpscr */ \
+  vfp_ ## fname ## _end: \
+      ldmia sp!, {fp, pc};     /* restore registers from stack and return */
+
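+/*
+** Editorial note on the unrolled loops (a sketch based on the VFPv2
+** short-vector model; not part of the original sources): writing LEN = 7
+** into FPSCR bits [18:16] puts the VFP into vector mode with a vector
+** length of 8.  Registers s0-s7 (and d0-d3) always behave as scalars,
+** while the higher banks act as short vectors, so with the destination in
+** a higher bank the single instruction
+**
+**   fadds s24, s8, s16
+**
+** performs eight additions: s24..s31 = s8..s15 + s16..s23.  The double
+** template below uses LEN = 3 (vector length 4) over d4..d15.  The caller's
+** FPSCR is saved in lr and restored on exit, roughly:
+**
+**   unsigned saved = fpscr;        // fmrx lr, fpscr
+**   fpscr = saved | (7u << 16);    // mov fp, #7; orr ...; fmxr fpscr, fp
+**   ...unrolled vector loop...
+**   fpscr = saved;                 // fmxr fpscr, lr
+*/
+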
+#define UNROLL_F64_TEMPLATE(fname,finst) \
+  .global vfp_ ## fname ## ; \
+  vfp_ ## fname ## : \
+      stmdb sp!, {fp, lr};     /* save registers to stack */ \
+      ands ip, r3, #3;         /* ip = n % 4 */ \
+      beq vfp_ ## fname ## _unroll; /* if ip == 0 goto unrolled loop */ \
+  vfp_ ## fname ## _loop1: \
+      fldmiad r1!, {d0}; \
+      fldmiad r2!, {d1}; \
+      ## finst ##d d2, d0, d1; \
+      fstmiad r0!, {d2}; \
+      subs ip, ip, #1; \
+      bne vfp_ ## fname ## _loop1; \
+  vfp_ ## fname ## _unroll:    /* unroll by 4 */ \
+      movs ip, r3, lsr #2;     /* ip = n / 4 */ \
+      beq vfp_ ## fname ## _end; /* if ip == 0 goto end */ \
+      fmrx lr, fpscr;          /* read fpscr into an ARM register */ \
+      mov fp, #3; \
+      orr fp, lr, fp, lsl #16; /* set vector length to 4 (FPSCR LEN = 3) */ \
+      fmxr fpscr, fp; \
+  vfp_ ## fname ## _loop2: \
+      fldmiad r1!, {d4, d5, d6, d7}; \
+      fldmiad r2!, {d8, d9, d10, d11}; \
+      ## finst ##d d12, d4, d8; \
+      fstmiad r0!, {d12, d13, d14, d15}; \
+      subs ip, ip, #1; \
+      bne vfp_ ## fname ## _loop2; \
+      fmxr fpscr, lr;          /* restore original fpscr */ \
+  vfp_ ## fname ## _end: \
+      ldmia sp!, {fp, pc};     /* restore registers from stack and return */
+
+.align 2
+UNROLL_F32_TEMPLATE(add_f32,fadd);
+UNROLL_F64_TEMPLATE(add_f64,fadd);
+
+UNROLL_F32_TEMPLATE(divide_f32,fdiv);
+UNROLL_F64_TEMPLATE(divide_f64,fdiv);
+
+UNROLL_F32_TEMPLATE(multiply_f32,fmul);
+UNROLL_F64_TEMPLATE(multiply_f64,fmul);
+
+UNROLL_F32_TEMPLATE(subtract_f32,fsub);
+UNROLL_F64_TEMPLATE(subtract_f64,fsub);
+
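+/*
+** Editorial note (not part of the original sources): each instantiation
+** pastes the precision suffix onto the operation name, so e.g.
+** UNROLL_F32_TEMPLATE(add_f32,fadd) emits the global symbol vfp_add_f32
+** built around the single-precision instruction fadds, while the _f64
+** variants use the corresponding double-precision instructions (faddd,
+** fdivd, fmuld, fsubd).
+*/
+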
+#undef UNROLL_F32_TEMPLATE
+#undef UNROLL_F64_TEMPLATE
+
+/*
+**
+** void vfp_scalaradd_f32_ns (float *d, const float *s1, const float *s2_1, int n);
+** void vfp_scalaradd_f64_ns (double *d, const double *s1, const double *s2_1, int n);
+** void vfp_scalarmultiply_f32_ns (float *d, const float *s1, const float *s2_1, int n);
+** void vfp_scalarmultiply_f64_ns (double *d, const double *s1, const double *s2_1, int n);
+**
+** d: $r0 | s1: $r1 | s2_1: $r2 | n: $r3 |
+**
+*/
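+/*
+** Editorial note (not part of the original sources): the scalar operand
+** *s2_1 is loaded once into s1/d1, which sit in the scalar register bank,
+** so in vector mode it is applied to every element.  In C terms, roughly:
+**
+**   for (i = 0; i < n; i++)
+**     d[i] = s1[i] + s2_1[0];   // vfp_scalaradd_f32_ns
+*/
+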
+#define UNROLL_F32_TEMPLATE(fname,finst) \
+  .global vfp_ ## fname ## ; \
+  vfp_ ## fname ## : \
+      stmdb sp!, {fp, lr};     /* save registers to stack */ \
+      fldmias r2, {s1};        /* load scalar value */ \
+      ands ip, r3, #7;         /* ip = n % 8 */ \
+      beq vfp_ ## fname ## _unroll; /* if ip == 0 goto unrolled loop */ \
+  vfp_ ## fname ## _loop1: \
+      fldmias r1!, {s0}; \
+      ## finst ##s s2, s0, s1; \
+      fstmias r0!, {s2}; \
+      subs ip, ip, #1; \
+      bne vfp_ ## fname ## _loop1; \
+  vfp_ ## fname ## _unroll:    /* unroll by 8 */ \
+      movs ip, r3, lsr #3;     /* ip = n / 8 */ \
+      beq vfp_ ## fname ## _end; /* if ip == 0 goto end */ \
+      fmrx lr, fpscr;          /* read fpscr into an ARM register */ \
+      mov fp, #7; \
+      orr fp, lr, fp, lsl #16; /* set vector length to 8 (FPSCR LEN = 7) */ \
+      fmxr fpscr, fp; \
+  vfp_ ## fname ## _loop2: \
+      fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}; \
+      ## finst ##s s24, s8, s1; \
+      fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}; \
+      subs ip, ip, #1; \
+      bne vfp_ ## fname ## _loop2; \
+      fmxr fpscr, lr;          /* restore original fpscr */ \
+  vfp_ ## fname ## _end: \
+      ldmia sp!, {fp, pc};     /* restore registers from stack and return */
+
+#define UNROLL_F64_TEMPLATE(fname,finst) \
+  .global vfp_ ## fname ## ; \
+  vfp_ ## fname ## : \
+      stmdb sp!, {fp, lr};     /* save registers to stack */ \
+      fldmiad r2, {d1};        /* load scalar value */ \
+      ands ip, r3, #3;         /* ip = n % 4 */ \
+      beq vfp_ ## fname ## _unroll; /* if ip == 0 goto unrolled loop */ \
+  vfp_ ## fname ## _loop1: \
+      fldmiad r1!, {d0}; \
+      ## finst ##d d2, d0, d1; \
+      fstmiad r0!, {d2}; \
+      subs ip, ip, #1; \
+      bne vfp_ ## fname ## _loop1; \
+  vfp_ ## fname ## _unroll:    /* unroll by 4 */ \
+      movs ip, r3, lsr #2;     /* ip = n / 4 */ \
+      beq vfp_ ## fname ## _end; /* if ip == 0 goto end */ \
+      fmrx lr, fpscr;          /* read fpscr into an ARM register */ \
+      mov fp, #3; \
+      orr fp, lr, fp, lsl #16; /* set vector length to 4 (FPSCR LEN = 3) */ \
+      fmxr fpscr, fp; \
+  vfp_ ## fname ## _loop2: \
+      fldmiad r1!, {d4, d5, d6, d7}; \
+      ## finst ##d d12, d4, d1; \
+      fstmiad r0!, {d12, d13, d14, d15}; \
+      subs ip, ip, #1; \
+      bne vfp_ ## fname ## _loop2; \
+      fmxr fpscr, lr;          /* restore original fpscr */ \
+  vfp_ ## fname ## _end: \
+      ldmia sp!, {fp, pc};     /* restore registers from stack and return */
+
+UNROLL_F32_TEMPLATE(scalaradd_f32_ns,fadd);
+UNROLL_F64_TEMPLATE(scalaradd_f64_ns,fadd);
+
+UNROLL_F32_TEMPLATE(scalarmultiply_f32_ns,fmul);
+UNROLL_F64_TEMPLATE(scalarmultiply_f64_ns,fmul);
+
+#undef UNROLL_F32_TEMPLATE
+#undef UNROLL_F64_TEMPLATE
+
+/*
+**
+** void vfp_abs_f32_f32_ns (float *d, const float *s, int n);
+** void vfp_abs_f64_f64_ns (double *d, const double *s, int n);
+** void vfp_negative_f32 (float *d, const float *s, int n);
+** void vfp_negative_f64 (double *d, const double *s, int n);
+**
+** d: $r0 | s: $r1 | n: $r2 |
+**
+*/
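+/*
+** Editorial note (not part of the original sources): these are unary
+** routines, so there is a single source pointer and the element count
+** arrives in r2 rather than r3.  In C terms, roughly:
+**
+**   for (i = 0; i < n; i++)
+**     d[i] = -s[i];             // vfp_negative_f32
+*/
+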
+#define UNROLL_F32_TEMPLATE(fname,finst) \
+  .global vfp_ ## fname ## ; \
+  vfp_ ## fname ## : \
+      stmdb sp!, {fp, lr};     /* save registers to stack */ \
+      ands ip, r2, #7;         /* ip = n % 8 */ \
+      beq vfp_ ## fname ## _unroll; /* if ip == 0 goto unrolled loop */ \
+  vfp_ ## fname ## _loop1: \
+      fldmias r1!, {s0}; \
+      ## finst ##s s2, s0; \
+      fstmias r0!, {s2}; \
+      subs ip, ip, #1; \
+      bne vfp_ ## fname ## _loop1; \
+  vfp_ ## fname ## _unroll:    /* unroll by 8 */ \
+      movs ip, r2, lsr #3;     /* ip = n / 8 */ \
+      beq vfp_ ## fname ## _end; /* if ip == 0 goto end */ \
+      fmrx lr, fpscr;          /* read fpscr into an ARM register */ \
+      mov fp, #7; \
+      orr fp, lr, fp, lsl #16; /* set vector length to 8 (FPSCR LEN = 7) */ \
+      fmxr fpscr, fp; \
+  vfp_ ## fname ## _loop2: \
+      fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}; \
+      ## finst ##s s24, s8; \
+      fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}; \
+      subs ip, ip, #1; \
+      bne vfp_ ## fname ## _loop2; \
+      fmxr fpscr, lr;          /* restore original fpscr */ \
+  vfp_ ## fname ## _end: \
+      ldmia sp!, {fp, pc};     /* restore registers from stack and return */
+
+#define UNROLL_F64_TEMPLATE(fname,finst) \
+  .global vfp_ ## fname ## ; \
+  vfp_ ## fname ## : \
+      stmdb sp!, {fp, lr};     /* save registers to stack */ \
+      ands ip, r2, #3;         /* ip = n % 4 */ \
+      beq vfp_ ## fname ## _unroll; /* if ip == 0 goto unrolled loop */ \
+  vfp_ ## fname ## _loop1: \
+      fldmiad r1!, {d0}; \
+      ## finst ##d d2, d0; \
+      fstmiad r0!, {d2}; \
+      subs ip, ip, #1; \
+      bne vfp_ ## fname ## _loop1; \
+  vfp_ ## fname ## _unroll:    /* unroll by 4 */ \
+      movs ip, r2, lsr #2;     /* ip = n / 4 */ \
+      beq vfp_ ## fname ## _end; /* if ip == 0 goto end */ \
+      fmrx lr, fpscr;          /* read fpscr into an ARM register */ \
+      mov fp, #3; \
+      orr fp, lr, fp, lsl #16; /* set vector length to 4 (FPSCR LEN = 3) */ \
+      fmxr fpscr, fp; \
+  vfp_ ## fname ## _loop2: \
+      fldmiad r1!, {d4, d5, d6, d7}; \
+      ## finst ##d d12, d4; \
+      fstmiad r0!, {d12, d13, d14, d15}; \
+      subs ip, ip, #1; \
+      bne vfp_ ## fname ## _loop2; \
+      fmxr fpscr, lr;          /* restore original fpscr */ \
+  vfp_ ## fname ## _end: \
+      ldmia sp!, {fp, pc};     /* restore registers from stack and return */
+
+UNROLL_F32_TEMPLATE(abs_f32_f32_ns,fabs);
+UNROLL_F64_TEMPLATE(abs_f64_f64_ns,fabs);
+
+UNROLL_F32_TEMPLATE(negative_f32,fneg);
+UNROLL_F64_TEMPLATE(negative_f64,fneg);
+
+#undef UNROLL_F32_TEMPLATE
+#undef UNROLL_F64_TEMPLATE
+#endif