Update contrib.
/*
 * Josep Torra <josep@fluendo.com>. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
** compile with -mcpu=arm1136j-s -mfpu=vfp -mfloat-abi=softfp
**
** void vfp_add_f32 (float *d, const float *s1, const float *s2, int n);
** void vfp_add_f64 (double *d, const double *s1, const double *s2, int n);
** void vfp_divide_f32 (float *d, const float *s1, const float *s2, int n);
** void vfp_divide_f64 (double *d, const double *s1, const double *s2, int n);
** void vfp_multiply_f32 (float *d, const float *s1, const float *s2, int n);
** void vfp_multiply_f64 (double *d, const double *s1, const double *s2, int n);
** void vfp_subtract_f32 (float *d, const float *s1, const float *s2, int n);
** void vfp_subtract_f64 (double *d, const double *s1, const double *s2, int n);
**
** d: $r0 | s1: $r1 | s2: $r2 | n: $r3 |
*/
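
/*
** Usage sketch (hypothetical C caller; mix_buffers is illustrative and
** not part of this file). It assumes this file is assembled and linked
** into the same binary and that the declaration matches the prototypes
** above:
**
**   extern void vfp_add_f32 (float *d, const float *s1, const float *s2, int n);
**
**   void mix_buffers (float *out, const float *a, const float *b, int n)
**   {
**     vfp_add_f32 (out, a, b, n);
**   }
**
** After the call, out[i] == a[i] + b[i] for 0 <= i < n.
*/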

#define UNROLL_F32_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr}; /* save registers to stack */ \
    ands ip, r3, #7; /* ip = n % 8 */ \
    beq vfp_ ## fname ## _unroll; /* if ip == 0 goto unroll */ \
  vfp_ ## fname ## _loop1: \
    fldmias r1!, {s0}; \
    fldmias r2!, {s1}; \
    ## finst ##s s2, s0, s1; \
    fstmias r0!, {s2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll: /* unroll by 8 */ \
    movs ip, r3, lsr #3; /* ip = n / 8 */ \
    beq vfp_ ## fname ## _end; /* if ip == 0 goto end */ \
    fmrx lr, fpscr; /* copy fpscr into an ARM register */ \
    mov fp, #7; \
    orr fp, lr, fp, lsl #16; /* set vector length to 8 */ \
    fmxr fpscr, fp; /* enable 8-way vector mode */ \
  vfp_ ## fname ## _loop2: \
    fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}; \
    fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}; \
    ## finst ##s s24, s8, s16; \
    fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr; /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc}; /* restore registers from stack and return */
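
/*
** Note on the fpscr trick used above: the short-vector LEN field occupies
** FPSCR bits [18:16] and holds (vector length - 1), so writing 7 selects
** 8-way vector mode, and each fadds/fmuls/... in loop2 then operates on a
** whole register bank per instruction. A C sketch of the same bit
** manipulation (illustrative only; set_vfp_vector_len is not part of
** this file):
**
**   unsigned int set_vfp_vector_len (unsigned int fpscr, unsigned int len)
**   {
**     return (fpscr & ~(7u << 16)) | ((len - 1u) << 16);
**   }
**
** The assembly only ORs the new LEN bits in (orr fp, lr, fp, lsl #16),
** which is correct under the usual assumption that LEN is 0 (scalar mode)
** on entry; the value saved in lr restores the caller's mode afterwards.
** For n = 13, the split above works out as 13 % 8 = 5 scalar iterations
** followed by 13 / 8 = 1 unrolled iteration.
*/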

#define UNROLL_F64_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr}; /* save registers to stack */ \
    ands ip, r3, #3; /* ip = n % 4 */ \
    beq vfp_ ## fname ## _unroll; /* if ip == 0 goto unroll */ \
  vfp_ ## fname ## _loop1: \
    fldmiad r1!, {d0}; \
    fldmiad r2!, {d1}; \
    ## finst ##d d2, d0, d1; \
    fstmiad r0!, {d2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll: /* unroll by 4 */ \
    movs ip, r3, lsr #2; /* ip = n / 4 */ \
    beq vfp_ ## fname ## _end; /* if ip == 0 goto end */ \
    fmrx lr, fpscr; /* copy fpscr into an ARM register */ \
    mov fp, #3; \
    orr fp, lr, fp, lsl #16; /* set vector length to 4 */ \
    fmxr fpscr, fp; /* enable 4-way vector mode */ \
  vfp_ ## fname ## _loop2: \
    fldmiad r1!, {d4, d5, d6, d7}; \
    fldmiad r2!, {d8, d9, d10, d11}; \
    ## finst ##d d12, d4, d8; \
    fstmiad r0!, {d12, d13, d14, d15}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr; /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc}; /* restore registers from stack and return */
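
/*
** The f64 variant unrolls by 4 rather than 8 because VFP short-vector
** operands are organized in banks of 4 double registers (8 singles), so
** the maximum vector length for doubles is 4; mov fp, #3 encodes that as
** LEN = 3 above.
*/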

UNROLL_F32_TEMPLATE(add_f32,fadd);
UNROLL_F64_TEMPLATE(add_f64,fadd);

UNROLL_F32_TEMPLATE(divide_f32,fdiv);
UNROLL_F64_TEMPLATE(divide_f64,fdiv);

UNROLL_F32_TEMPLATE(multiply_f32,fmul);
UNROLL_F64_TEMPLATE(multiply_f64,fmul);

UNROLL_F32_TEMPLATE(subtract_f32,fsub);
UNROLL_F64_TEMPLATE(subtract_f64,fsub);

#undef UNROLL_F32_TEMPLATE
#undef UNROLL_F64_TEMPLATE
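
/*
** Each instantiation above pastes the operation suffix onto the
** instruction mnemonic: UNROLL_F32_TEMPLATE(add_f32,fadd) is intended to
** expand "## finst ##s" to fadds and emit the global symbol vfp_add_f32;
** the f64 template emits faddd and vfp_add_f64 the same way.
*/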

/*
** void vfp_scalaradd_f32_ns (float *d, const float *s1, const float *s2_1, int n);
** void vfp_scalaradd_f64_ns (double *d, const double *s1, const double *s2_1, int n);
** void vfp_scalarmultiply_f32_ns (float *d, const float *s1, const float *s2_1, int n);
** void vfp_scalarmultiply_f64_ns (double *d, const double *s1, const double *s2_1, int n);
**
** d: $r0 | s1: $r1 | s2_1: $r2 | n: $r3 |
*/
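
/*
** Usage sketch (hypothetical C caller; apply_gain is illustrative and not
** part of this file). The scalar operand is passed by pointer, matching
** the prototypes above:
**
**   extern void vfp_scalarmultiply_f32_ns (float *d, const float *s1,
**                                          const float *s2_1, int n);
**
**   void apply_gain (float *buf, float gain, int n)
**   {
**     vfp_scalarmultiply_f32_ns (buf, buf, &gain, n);
**   }
**
** In-place use (d == s1) works with the loops below because each chunk is
** loaded before it is stored, though that is an observation about this
** code, not a documented guarantee.
*/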

#define UNROLL_F32_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr}; /* save registers to stack */ \
    fldmias r2, {s1}; /* load scalar value */ \
    ands ip, r3, #7; /* ip = n % 8 */ \
    beq vfp_ ## fname ## _unroll; /* if ip == 0 goto unroll */ \
  vfp_ ## fname ## _loop1: \
    fldmias r1!, {s0}; \
    ## finst ##s s2, s0, s1; \
    fstmias r0!, {s2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll: /* unroll by 8 */ \
    movs ip, r3, lsr #3; /* ip = n / 8 */ \
    beq vfp_ ## fname ## _end; /* if ip == 0 goto end */ \
    fmrx lr, fpscr; /* copy fpscr into an ARM register */ \
    mov fp, #7; \
    orr fp, lr, fp, lsl #16; /* set vector length to 8 */ \
    fmxr fpscr, fp; /* enable 8-way vector mode */ \
  vfp_ ## fname ## _loop2: \
    fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}; \
    ## finst ##s s24, s8, s1; /* s1 sits in the scalar bank (s0-s7) */ \
    fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr; /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc}; /* restore registers from stack and return */

#define UNROLL_F64_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr}; /* save registers to stack */ \
    fldmiad r2, {d1}; /* load scalar value */ \
    ands ip, r3, #3; /* ip = n % 4 */ \
    beq vfp_ ## fname ## _unroll; /* if ip == 0 goto unroll */ \
  vfp_ ## fname ## _loop1: \
    fldmiad r1!, {d0}; \
    ## finst ##d d2, d0, d1; \
    fstmiad r0!, {d2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll: /* unroll by 4 */ \
    movs ip, r3, lsr #2; /* ip = n / 4 */ \
    beq vfp_ ## fname ## _end; /* if ip == 0 goto end */ \
    fmrx lr, fpscr; /* copy fpscr into an ARM register */ \
    mov fp, #3; \
    orr fp, lr, fp, lsl #16; /* set vector length to 4 */ \
    fmxr fpscr, fp; /* enable 4-way vector mode */ \
  vfp_ ## fname ## _loop2: \
    fldmiad r1!, {d4, d5, d6, d7}; \
    ## finst ##d d12, d4, d1; /* d1 sits in the scalar bank (d0-d3) */ \
    fstmiad r0!, {d12, d13, d14, d15}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr; /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc}; /* restore registers from stack and return */

UNROLL_F32_TEMPLATE(scalaradd_f32_ns,fadd);
UNROLL_F64_TEMPLATE(scalaradd_f64_ns,fadd);

UNROLL_F32_TEMPLATE(scalarmultiply_f32_ns,fmul);
UNROLL_F64_TEMPLATE(scalarmultiply_f64_ns,fmul);

#undef UNROLL_F32_TEMPLATE
#undef UNROLL_F64_TEMPLATE

/*
** void vfp_abs_f32_f32_ns (float *d, const float *s, int n);
** void vfp_abs_f64_f64_ns (double *d, const double *s, int n);
** void vfp_negative_f32 (float *d, const float *s, int n);
** void vfp_negative_f64 (double *d, const double *s, int n);
**
** d: $r0 | s: $r1 | n: $r2 |
*/
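
/*
** Usage sketch (hypothetical C caller; invert_phase is illustrative and
** not part of this file):
**
**   extern void vfp_negative_f32 (float *d, const float *s, int n);
**
**   void invert_phase (float *out, const float *in, int n)
**   {
**     vfp_negative_f32 (out, in, n);
**   }
**
** After the call, out[i] == -in[i] for 0 <= i < n.
*/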

#define UNROLL_F32_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr}; /* save registers to stack */ \
    ands ip, r2, #7; /* ip = n % 8 */ \
    beq vfp_ ## fname ## _unroll; /* if ip == 0 goto unroll */ \
  vfp_ ## fname ## _loop1: \
    fldmias r1!, {s0}; \
    ## finst ##s s2, s0; \
    fstmias r0!, {s2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll: /* unroll by 8 */ \
    movs ip, r2, lsr #3; /* ip = n / 8 */ \
    beq vfp_ ## fname ## _end; /* if ip == 0 goto end */ \
    fmrx lr, fpscr; /* copy fpscr into an ARM register */ \
    mov fp, #7; \
    orr fp, lr, fp, lsl #16; /* set vector length to 8 */ \
    fmxr fpscr, fp; /* enable 8-way vector mode */ \
  vfp_ ## fname ## _loop2: \
    fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}; \
    ## finst ##s s24, s8; \
    fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr; /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc}; /* restore registers from stack and return */

#define UNROLL_F64_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr}; /* save registers to stack */ \
    ands ip, r2, #3; /* ip = n % 4 */ \
    beq vfp_ ## fname ## _unroll; /* if ip == 0 goto unroll */ \
  vfp_ ## fname ## _loop1: \
    fldmiad r1!, {d0}; \
    ## finst ##d d2, d0; \
    fstmiad r0!, {d2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll: /* unroll by 4 */ \
    movs ip, r2, lsr #2; /* ip = n / 4 */ \
    beq vfp_ ## fname ## _end; /* if ip == 0 goto end */ \
    fmrx lr, fpscr; /* copy fpscr into an ARM register */ \
    mov fp, #3; \
    orr fp, lr, fp, lsl #16; /* set vector length to 4 */ \
    fmxr fpscr, fp; /* enable 4-way vector mode */ \
  vfp_ ## fname ## _loop2: \
    fldmiad r1!, {d4, d5, d6, d7}; \
    ## finst ##d d12, d4; \
    fstmiad r0!, {d12, d13, d14, d15}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr; /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc}; /* restore registers from stack and return */

UNROLL_F32_TEMPLATE(abs_f32_f32_ns,fabs);
UNROLL_F64_TEMPLATE(abs_f64_f64_ns,fabs);

UNROLL_F32_TEMPLATE(negative_f32,fneg);
UNROLL_F64_TEMPLATE(negative_f64,fneg);

#undef UNROLL_F32_TEMPLATE
#undef UNROLL_F64_TEMPLATE