Update contrib.
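/*
 * VFP vector arithmetic routines (Symbian EXPORT_C, __NAKED__ inline asm).
 * Each routine follows the same pattern: a scalar loop handles the n % LEN
 * leftover elements, then VFP short-vector mode is enabled by writing LEN-1
 * into the FPSCR LEN field (bits 18:16) so each arithmetic instruction
 * operates on a whole register bank, and the original FPSCR is restored on
 * exit. Arguments arrive in r0..r3 (d, s1/s, s2/scalar, n) per the ARM
 * procedure call standard.
 */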
EXPORT_C __NAKED__ void vfp_add_f32 (float *d, const float *s1, const float *s2, int n)
{
asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
asm("ands ip, r3, #7"); /* ip = n % 8 */
asm("beq vfp_add_f32_unroll"); /* if ip == 0 goto unroll */
asm("vfp_add_f32_loop1:");
//asm("fldmias r1!, {s0}");
VFP_FLDMIAS(CC_AL,1,0,1);
//asm("fldmias r2!, {s1}");
VFP_FLDMIAS(CC_AL,2,1,1);
//asm("fadds s2, s0, s1");
VFP_FADDS(CC_AL,2,0,1);
//asm("fstmias r0!, {s2}");
VFP_FSTMIAS(CC_AL,0,2,1);
asm("subs ip, ip, #1");
asm("bne vfp_add_f32_loop1");
asm("vfp_add_f32_unroll: movs ip, r3, lsr #3"); /* unroll by 8; ip = n / 8 */
asm("beq vfp_add_f32_end"); /* if ip == 0 goto finish */
//asm("fmrx lr, fpscr"); /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);
asm("mov fp, #7"); /* LEN - 1 */
asm("orr fp, lr, fp, lsl #16"); /* set vector length to 8 */
//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);
asm("vfp_add_f32_loop2:");
//asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
VFP_FLDMIAS(CC_AL,1,8,8);
//asm("fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}");
VFP_FLDMIAS(CC_AL,2,16,8);
//asm("fadds s24, s8, s16");
VFP_FADDS(CC_AL,24,8,16);
//asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
VFP_FSTMIAS(CC_AL,0,24,8);
asm("subs ip, ip, #1");
asm("bne vfp_add_f32_loop2");
//asm("fmxr fpscr, lr"); /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);
asm("vfp_add_f32_end:");
asm("ldmia sp!, {fp, pc}"); /* restore from stack and return */
}

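/*
 * Plain-C sketch of the control flow above (illustrative, not part of the
 * library): loop1 handles the n % 8 head elements one by one, loop2 then
 * processes 8 elements per iteration, which is what a single fadds performs
 * in short-vector mode.
 */
static void ref_add_f32(float *d, const float *s1, const float *s2, int n)
{
    int i, head = n & 7;              /* "ands ip, r3, #7" */
    for (i = 0; i < head; i++)        /* loop1: leftover elements */
        *d++ = *s1++ + *s2++;
    for (i = 0; i < (n >> 3); i++) {  /* loop2: "movs ip, r3, lsr #3" */
        int j;
        for (j = 0; j < 8; j++)       /* one 8-wide fadds plus fstmias */
            d[j] = s1[j] + s2[j];
        d += 8; s1 += 8; s2 += 8;
    }
}
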
EXPORT_C __NAKED__ void vfp_divide_f32 (float *d, const float *s1, const float *s2, int n)
{
asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
asm("ands ip, r3, #7"); /* ip = n % 8 */
asm("beq vfp_divide_f32_unroll"); /* if ip == 0 goto unroll */
asm("vfp_divide_f32_loop1:");
//asm("fldmias r1!, {s0}");
VFP_FLDMIAS(CC_AL,1,0,1);
//asm("fldmias r2!, {s1}");
VFP_FLDMIAS(CC_AL,2,1,1);
//asm("fdivs s2, s0, s1");
VFP_FDIVS(CC_AL,2,0,1);
//asm("fstmias r0!, {s2}");
VFP_FSTMIAS(CC_AL,0,2,1);
asm("subs ip, ip, #1");
asm("bne vfp_divide_f32_loop1");
asm("vfp_divide_f32_unroll: movs ip, r3, lsr #3"); /* unroll by 8; ip = n / 8 */
asm("beq vfp_divide_f32_end"); /* if ip == 0 goto finish */
//asm("fmrx lr, fpscr"); /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);
asm("mov fp, #7"); /* LEN - 1 */
asm("orr fp, lr, fp, lsl #16"); /* set vector length to 8 */
//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);
asm("vfp_divide_f32_loop2:");
//asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
VFP_FLDMIAS(CC_AL,1,8,8);
//asm("fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}");
VFP_FLDMIAS(CC_AL,2,16,8);
//asm("fdivs s24, s8, s16");
VFP_FDIVS(CC_AL,24,8,16);
//asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
VFP_FSTMIAS(CC_AL,0,24,8);
asm("subs ip, ip, #1");
asm("bne vfp_divide_f32_loop2");
//asm("fmxr fpscr, lr"); /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);
asm("vfp_divide_f32_end:");
asm("ldmia sp!, {fp, pc}"); /* restore from stack and return */
}

EXPORT_C __NAKED__ void vfp_multiply_f32 (float *d, const float *s1, const float *s2, int n)
{
asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
asm("ands ip, r3, #7"); /* ip = n % 8 */
asm("beq vfp_multiply_f32_unroll"); /* if ip == 0 goto unroll */
asm("vfp_multiply_f32_loop1:");
//asm("fldmias r1!, {s0}");
VFP_FLDMIAS(CC_AL,1,0,1);
//asm("fldmias r2!, {s1}");
VFP_FLDMIAS(CC_AL,2,1,1);
//asm("fmuls s2, s0, s1");
VFP_FMULS(CC_AL,2,0,1);
//asm("fstmias r0!, {s2}");
VFP_FSTMIAS(CC_AL,0,2,1);
asm("subs ip, ip, #1");
asm("bne vfp_multiply_f32_loop1");
asm("vfp_multiply_f32_unroll: movs ip, r3, lsr #3"); /* unroll by 8; ip = n / 8 */
asm("beq vfp_multiply_f32_end"); /* if ip == 0 goto finish */
//asm("fmrx lr, fpscr"); /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);
asm("mov fp, #7"); /* LEN - 1 */
asm("orr fp, lr, fp, lsl #16"); /* set vector length to 8 */
//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);
asm("vfp_multiply_f32_loop2:");
//asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
VFP_FLDMIAS(CC_AL,1,8,8);
//asm("fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}");
VFP_FLDMIAS(CC_AL,2,16,8);
//asm("fmuls s24, s8, s16");
VFP_FMULS(CC_AL,24,8,16);
//asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
VFP_FSTMIAS(CC_AL,0,24,8);
asm("subs ip, ip, #1");
asm("bne vfp_multiply_f32_loop2");
//asm("fmxr fpscr, lr"); /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);
asm("vfp_multiply_f32_end:");
asm("ldmia sp!, {fp, pc}"); /* restore from stack and return */
}

EXPORT_C __NAKED__ void vfp_subtract_f32 (float *d, const float *s1, const float *s2, int n)
{
asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
asm("ands ip, r3, #7"); /* ip = n % 8 */
asm("beq vfp_subtract_f32_unroll"); /* if ip == 0 goto unroll */
asm("vfp_subtract_f32_loop1:");
//asm("fldmias r1!, {s0}");
VFP_FLDMIAS(CC_AL,1,0,1);
//asm("fldmias r2!, {s1}");
VFP_FLDMIAS(CC_AL,2,1,1);
//asm("fsubs s2, s0, s1");
VFP_FSUBS(CC_AL,2,0,1);
//asm("fstmias r0!, {s2}");
VFP_FSTMIAS(CC_AL,0,2,1);
asm("subs ip, ip, #1");
asm("bne vfp_subtract_f32_loop1");
asm("vfp_subtract_f32_unroll: movs ip, r3, lsr #3"); /* unroll by 8; ip = n / 8 */
asm("beq vfp_subtract_f32_end"); /* if ip == 0 goto finish */
//asm("fmrx lr, fpscr"); /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);
asm("mov fp, #7"); /* LEN - 1 */
asm("orr fp, lr, fp, lsl #16"); /* set vector length to 8 */
//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);
asm("vfp_subtract_f32_loop2:");
//asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
VFP_FLDMIAS(CC_AL,1,8,8);
//asm("fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}");
VFP_FLDMIAS(CC_AL,2,16,8);
//asm("fsubs s24, s8, s16");
VFP_FSUBS(CC_AL,24,8,16);
//asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
VFP_FSTMIAS(CC_AL,0,24,8);
asm("subs ip, ip, #1");
asm("bne vfp_subtract_f32_loop2");
//asm("fmxr fpscr, lr"); /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);
asm("vfp_subtract_f32_end:");
asm("ldmia sp!, {fp, pc}"); /* restore from stack and return */
}

EXPORT_C __NAKED__ void vfp_add_f64 (double *d, const double *s1, const double *s2, int n)
{
asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
asm("ands ip, r3, #3"); /* ip = n % 4 */
asm("beq vfp_add_f64_unroll"); /* if ip == 0 goto unroll */
asm("vfp_add_f64_loop1:");
//asm("fldmiad r1!, {d0}");
VFP_FLDMIAD(CC_AL,1,0,1);
//asm("fldmiad r2!, {d1}");
VFP_FLDMIAD(CC_AL,2,1,1);
//asm("faddd d2, d0, d1");
VFP_FADDD(CC_AL,2,0,1);
//asm("fstmiad r0!, {d2}");
VFP_FSTMIAD(CC_AL,0,2,1);
asm("subs ip, ip, #1");
asm("bne vfp_add_f64_loop1");
asm("vfp_add_f64_unroll:"); /* unroll by 4 */
asm("movs ip, r3, lsr #2"); /* ip = n / 4 */
asm("beq vfp_add_f64_end"); /* if ip == 0 goto finish */
//asm("fmrx lr, fpscr"); /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);
asm("mov fp, #3"); /* LEN - 1 */
asm("orr fp, lr, fp, lsl #16"); /* set vector length to 4 */
//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);
asm("vfp_add_f64_loop2:");
//asm("fldmiad r1!, {d4, d5, d6, d7}");
VFP_FLDMIAD(CC_AL,1,4,4);
//asm("fldmiad r2!, {d8, d9, d10, d11}");
VFP_FLDMIAD(CC_AL,2,8,4);
//asm("faddd d12, d4, d8");
VFP_FADDD(CC_AL,12,4,8);
//asm("fstmiad r0!, {d12, d13, d14, d15}");
VFP_FSTMIAD(CC_AL,0,12,4);
asm("subs ip, ip, #1");
asm("bne vfp_add_f64_loop2");
//asm("fmxr fpscr, lr"); /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);
asm("vfp_add_f64_end:");
asm("ldmia sp!, {fp, pc}"); /* restore from stack and return */
}

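/*
 * The FPSCR sequence above, spelled out: with the saved control word in lr
 * and LEN-1 in fp, "orr fp, lr, fp, lsl #16" computes
 *
 *     new_fpscr = old_fpscr | ((LEN - 1) << 16);
 *
 * i.e. bits 18:16 select the vector length (7 -> 8 floats, 3 -> 4 doubles).
 * Note the orr does not clear the field first, so this assumes LEN was zero
 * (scalar mode) on entry; restoring lr at the end undoes it regardless.
 */
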
EXPORT_C __NAKED__ void vfp_abs_f32_f32_ns(float *d, const float *s, int n)
{
asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
asm("ands ip, r2, #7"); /* ip = n % 8 */
asm("beq vfp_abs_f32_f32_ns_unroll"); /* if ip == 0 goto unroll */
asm("vfp_abs_f32_f32_ns_loop1:");
//asm("fldmias r1!, {s0}");
VFP_FLDMIAS(CC_AL,1,0,1);
//asm("fabss s2, s0");
VFP_FABSS(CC_AL,2,0);
//asm("fstmias r0!, {s2}");
VFP_FSTMIAS(CC_AL,0,2,1);
asm("subs ip, ip, #1");
asm("bne vfp_abs_f32_f32_ns_loop1");
asm("vfp_abs_f32_f32_ns_unroll:"); /* unroll by 8 */
asm("movs ip, r2, lsr #3"); /* ip = n / 8 */
asm("beq vfp_abs_f32_f32_ns_end"); /* if ip == 0 goto finish */
//asm("fmrx lr, fpscr"); /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);
asm("mov fp, #7"); /* LEN - 1 */
asm("orr fp, lr, fp, lsl #16"); /* set vector length to 8 */
//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);
asm("vfp_abs_f32_f32_ns_loop2:");
//asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
VFP_FLDMIAS(CC_AL,1,8,8);
//asm("fabss s24, s8");
VFP_FABSS(CC_AL,24,8);
//asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
VFP_FSTMIAS(CC_AL,0,24,8);
asm("subs ip, ip, #1");
asm("bne vfp_abs_f32_f32_ns_loop2");
//asm("fmxr fpscr, lr"); /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);
asm("vfp_abs_f32_f32_ns_end:");
asm("ldmia sp!, {fp, pc}"); /* restore from stack and return */
}

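/*
 * Usage sketch for the unary routines (illustrative only; the _ns suffix is
 * presumably "no stride", i.e. densely packed arrays):
 *
 *     float s[4] = { -1.0f, 2.0f, -3.0f, 4.0f };
 *     float d[4];
 *     vfp_abs_f32_f32_ns(d, s, 4);   // d becomes { 1.0f, 2.0f, 3.0f, 4.0f }
 */
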
EXPORT_C __NAKED__ void vfp_negative_f32(float *d, const float *s, int n)
{
asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
asm("ands ip, r2, #7"); /* ip = n % 8 */
asm("beq vfp_negative_f32_unroll"); /* if ip == 0 goto unroll */
asm("vfp_negative_f32_loop1:");
//asm("fldmias r1!, {s0}");
VFP_FLDMIAS(CC_AL,1,0,1);
//asm("fnegs s2, s0");
VFP_FNEGS(CC_AL,2,0);
//asm("fstmias r0!, {s2}");
VFP_FSTMIAS(CC_AL,0,2,1);
asm("subs ip, ip, #1");
asm("bne vfp_negative_f32_loop1");
asm("vfp_negative_f32_unroll:"); /* unroll by 8 */
asm("movs ip, r2, lsr #3"); /* ip = n / 8 */
asm("beq vfp_negative_f32_end"); /* if ip == 0 goto finish */
//asm("fmrx lr, fpscr"); /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);
asm("mov fp, #7"); /* LEN - 1 */
asm("orr fp, lr, fp, lsl #16"); /* set vector length to 8 */
//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);
asm("vfp_negative_f32_loop2:");
//asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
VFP_FLDMIAS(CC_AL,1,8,8);
//asm("fnegs s24, s8");
VFP_FNEGS(CC_AL,24,8);
//asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
VFP_FSTMIAS(CC_AL,0,24,8);
asm("subs ip, ip, #1");
asm("bne vfp_negative_f32_loop2");
//asm("fmxr fpscr, lr"); /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);
asm("vfp_negative_f32_end:");
asm("ldmia sp!, {fp, pc}"); /* restore from stack and return */
}

EXPORT_C __NAKED__ void vfp_abs_f64_f64_ns(double *d, const double *s, int n)
{
asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
asm("ands ip, r2, #3"); /* ip = n % 4 */
asm("beq vfp_abs_f64_f64_ns_unroll"); /* if ip == 0 goto unroll */
asm("vfp_abs_f64_f64_ns_loop1:");
//asm("fldmiad r1!, {d0}");
VFP_FLDMIAD(CC_AL,1,0,1);
//asm("fabsd d2, d0");
VFP_FABSD(CC_AL,2,0); /* macro name assumed, by analogy with VFP_FABSS */
//asm("fstmiad r0!, {d2}");
VFP_FSTMIAD(CC_AL,0,2,1);
asm("subs ip, ip, #1");
asm("bne vfp_abs_f64_f64_ns_loop1");
asm("vfp_abs_f64_f64_ns_unroll:"); /* unroll by 4 */
asm("movs ip, r2, lsr #2"); /* ip = n / 4 */
asm("beq vfp_abs_f64_f64_ns_end"); /* if ip == 0 goto finish */
//asm("fmrx lr, fpscr"); /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);
asm("mov fp, #3"); /* LEN - 1 */
asm("orr fp, lr, fp, lsl #16"); /* set vector length to 4 */
//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);
asm("vfp_abs_f64_f64_ns_loop2:");
//asm("fldmiad r1!, {d4, d5, d6, d7}");
VFP_FLDMIAD(CC_AL,1,4,4);
//asm("fabsd d12, d4");
VFP_FABSD(CC_AL,12,4); /* macro name assumed, see above */
//asm("fstmiad r0!, {d12, d13, d14, d15}");
VFP_FSTMIAD(CC_AL,0,12,4);
asm("subs ip, ip, #1");
asm("bne vfp_abs_f64_f64_ns_loop2");
//asm("fmxr fpscr, lr"); /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);
asm("vfp_abs_f64_f64_ns_end:");
asm("ldmia sp!, {fp, pc}"); /* restore from stack and return */
}

EXPORT_C __NAKED__ void vfp_negative_f64(double *d, const double *s, int n)
{
asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
asm("ands ip, r2, #3"); /* ip = n % 4 */
asm("beq vfp_negative_f64_unroll"); /* if ip == 0 goto unroll */
asm("vfp_negative_f64_loop1:");
//asm("fldmiad r1!, {d0}");
VFP_FLDMIAD(CC_AL,1,0,1);
//asm("fnegd d2, d0");
VFP_FNEGD(CC_AL,2,0); /* macro name assumed, by analogy with VFP_FNEGS */
//asm("fstmiad r0!, {d2}");
VFP_FSTMIAD(CC_AL,0,2,1);
asm("subs ip, ip, #1");
asm("bne vfp_negative_f64_loop1");
asm("vfp_negative_f64_unroll:"); /* unroll by 4 */
asm("movs ip, r2, lsr #2"); /* ip = n / 4 */
asm("beq vfp_negative_f64_end"); /* if ip == 0 goto finish */
//asm("fmrx lr, fpscr"); /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);
asm("mov fp, #3"); /* LEN - 1 */
asm("orr fp, lr, fp, lsl #16"); /* set vector length to 4 */
//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);
asm("vfp_negative_f64_loop2:");
//asm("fldmiad r1!, {d4, d5, d6, d7}");
VFP_FLDMIAD(CC_AL,1,4,4);
//asm("fnegd d12, d4");
VFP_FNEGD(CC_AL,12,4); /* macro name assumed, see above */
//asm("fstmiad r0!, {d12, d13, d14, d15}");
VFP_FSTMIAD(CC_AL,0,12,4);
asm("subs ip, ip, #1");
asm("bne vfp_negative_f64_loop2");
//asm("fmxr fpscr, lr"); /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);
asm("vfp_negative_f64_end:");
asm("ldmia sp!, {fp, pc}"); /* restore from stack and return */
}

EXPORT_C __NAKED__ void vfp_divide_f64 (double *d, const double *s1, const double *s2, int n)
{
asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
asm("ands ip, r3, #3"); /* ip = n % 4 */
asm("beq vfp_divide_f64_unroll"); /* if ip == 0 goto unroll */
asm("vfp_divide_f64_loop1:");
//asm("fldmiad r1!, {d0}");
VFP_FLDMIAD(CC_AL,1,0,1);
//asm("fldmiad r2!, {d1}");
VFP_FLDMIAD(CC_AL,2,1,1);
//asm("fdivd d2, d0, d1");
VFP_FDIVD(CC_AL,2,0,1); /* macro name assumed, by analogy with VFP_FDIVS/VFP_FADDD */
//asm("fstmiad r0!, {d2}");
VFP_FSTMIAD(CC_AL,0,2,1);
asm("subs ip, ip, #1");
asm("bne vfp_divide_f64_loop1");
asm("vfp_divide_f64_unroll:"); /* unroll by 4 */
asm("movs ip, r3, lsr #2"); /* ip = n / 4 */
asm("beq vfp_divide_f64_end"); /* if ip == 0 goto finish */
//asm("fmrx lr, fpscr"); /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);
asm("mov fp, #3"); /* LEN - 1 */
asm("orr fp, lr, fp, lsl #16"); /* set vector length to 4 */
//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);
asm("vfp_divide_f64_loop2:");
//asm("fldmiad r1!, {d4, d5, d6, d7}");
VFP_FLDMIAD(CC_AL,1,4,4);
//asm("fldmiad r2!, {d8, d9, d10, d11}");
VFP_FLDMIAD(CC_AL,2,8,4);
//asm("fdivd d12, d4, d8");
VFP_FDIVD(CC_AL,12,4,8); /* macro name assumed, see above */
//asm("fstmiad r0!, {d12, d13, d14, d15}");
VFP_FSTMIAD(CC_AL,0,12,4);
asm("subs ip, ip, #1");
asm("bne vfp_divide_f64_loop2");
//asm("fmxr fpscr, lr"); /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);
asm("vfp_divide_f64_end:");
asm("ldmia sp!, {fp, pc}"); /* restore from stack and return */
}

EXPORT_C __NAKED__ void vfp_multiply_f64 (double *d, const double *s1, const double *s2, int n)
{
asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
asm("ands ip, r3, #3"); /* ip = n % 4 */
asm("beq vfp_multiply_f64_unroll"); /* if ip == 0 goto unroll */
asm("vfp_multiply_f64_loop1:");
//asm("fldmiad r1!, {d0}");
VFP_FLDMIAD(CC_AL,1,0,1);
//asm("fldmiad r2!, {d1}");
VFP_FLDMIAD(CC_AL,2,1,1);
//asm("fmuld d2, d0, d1");
VFP_FMULD(CC_AL,2,0,1); /* macro name assumed, by analogy with VFP_FMULS/VFP_FADDD */
//asm("fstmiad r0!, {d2}");
VFP_FSTMIAD(CC_AL,0,2,1);
asm("subs ip, ip, #1");
asm("bne vfp_multiply_f64_loop1");
asm("vfp_multiply_f64_unroll:"); /* unroll by 4 */
asm("movs ip, r3, lsr #2"); /* ip = n / 4 */
asm("beq vfp_multiply_f64_end"); /* if ip == 0 goto finish */
//asm("fmrx lr, fpscr"); /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);
asm("mov fp, #3"); /* LEN - 1 */
asm("orr fp, lr, fp, lsl #16"); /* set vector length to 4 */
//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);
asm("vfp_multiply_f64_loop2:");
//asm("fldmiad r1!, {d4, d5, d6, d7}");
VFP_FLDMIAD(CC_AL,1,4,4);
//asm("fldmiad r2!, {d8, d9, d10, d11}");
VFP_FLDMIAD(CC_AL,2,8,4);
//asm("fmuld d12, d4, d8");
VFP_FMULD(CC_AL,12,4,8); /* macro name assumed, see above */
//asm("fstmiad r0!, {d12, d13, d14, d15}");
VFP_FSTMIAD(CC_AL,0,12,4);
asm("subs ip, ip, #1");
asm("bne vfp_multiply_f64_loop2");
//asm("fmxr fpscr, lr"); /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);
asm("vfp_multiply_f64_end:");
asm("ldmia sp!, {fp, pc}"); /* restore from stack and return */
}

EXPORT_C __NAKED__ void vfp_subtract_f64 (double *d, const double *s1, const double *s2, int n)
{
asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
asm("ands ip, r3, #3"); /* ip = n % 4 */
asm("beq vfp_subtract_f64_unroll"); /* if ip == 0 goto unroll */
asm("vfp_subtract_f64_loop1:");
//asm("fldmiad r1!, {d0}");
VFP_FLDMIAD(CC_AL,1,0,1);
//asm("fldmiad r2!, {d1}");
VFP_FLDMIAD(CC_AL,2,1,1);
//asm("fsubd d2, d0, d1");
VFP_FSUBD(CC_AL,2,0,1); /* macro name assumed, by analogy with VFP_FSUBS/VFP_FADDD */
//asm("fstmiad r0!, {d2}");
VFP_FSTMIAD(CC_AL,0,2,1);
asm("subs ip, ip, #1");
asm("bne vfp_subtract_f64_loop1");
asm("vfp_subtract_f64_unroll:"); /* unroll by 4 */
asm("movs ip, r3, lsr #2"); /* ip = n / 4 */
asm("beq vfp_subtract_f64_end"); /* if ip == 0 goto finish */
//asm("fmrx lr, fpscr"); /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);
asm("mov fp, #3"); /* LEN - 1 */
asm("orr fp, lr, fp, lsl #16"); /* set vector length to 4 */
//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);
asm("vfp_subtract_f64_loop2:");
//asm("fldmiad r1!, {d4, d5, d6, d7}");
VFP_FLDMIAD(CC_AL,1,4,4);
//asm("fldmiad r2!, {d8, d9, d10, d11}");
VFP_FLDMIAD(CC_AL,2,8,4);
//asm("fsubd d12, d4, d8");
VFP_FSUBD(CC_AL,12,4,8); /* macro name assumed, see above */
//asm("fstmiad r0!, {d12, d13, d14, d15}");
VFP_FSTMIAD(CC_AL,0,12,4);
asm("subs ip, ip, #1");
asm("bne vfp_subtract_f64_loop2");
//asm("fmxr fpscr, lr"); /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);
asm("vfp_subtract_f64_end:");
asm("ldmia sp!, {fp, pc}"); /* restore from stack and return */
}

EXPORT_C __NAKED__ void vfp_scalaradd_f32_ns (float *d, const float *s1, const float *s2_1, int n)
{
asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
//asm("fldmias r2, {s1}"); /* load scalar value once */
VFP_FLDMIAS(CC_AL,2,1,1);
asm("ands ip, r3, #7"); /* ip = n % 8 */
asm("beq vfp_scalaradd_f32_ns_unroll"); /* if ip == 0 goto unroll */
asm("vfp_scalaradd_f32_ns_loop1:");
//asm("fldmias r1!, {s0}");
VFP_FLDMIAS(CC_AL,1,0,1);
//asm("fadds s2, s0, s1");
VFP_FADDS(CC_AL,2,0,1);
//asm("fstmias r0!, {s2}");
VFP_FSTMIAS(CC_AL,0,2,1);
asm("subs ip, ip, #1");
asm("bne vfp_scalaradd_f32_ns_loop1");
asm("vfp_scalaradd_f32_ns_unroll:"); /* unroll by 8 */
asm("movs ip, r3, lsr #3"); /* ip = n / 8 */
asm("beq vfp_scalaradd_f32_ns_end"); /* if ip == 0 goto finish */
//asm("fmrx lr, fpscr"); /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);
asm("mov fp, #7"); /* LEN - 1 */
asm("orr fp, lr, fp, lsl #16"); /* set vector length to 8 */
//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);
asm("vfp_scalaradd_f32_ns_loop2:");
//asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
VFP_FLDMIAS(CC_AL,1,8,8);
//asm("fadds s24, s8, s1"); /* s1 sits in bank 0, so it stays scalar */
VFP_FADDS(CC_AL,24,8,1);
//asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
VFP_FSTMIAS(CC_AL,0,24,8);
asm("subs ip, ip, #1");
asm("bne vfp_scalaradd_f32_ns_loop2");
//asm("fmxr fpscr, lr"); /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);
asm("vfp_scalaradd_f32_ns_end:");
asm("ldmia sp!, {fp, pc}"); /* restore from stack and return */
}

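/*
 * Usage sketch for the scalar variants: s2_1 points at a single operand
 * that is loaded once into s1 and reused for every element, so
 * d[i] = s1[i] + *s2_1. Keeping it in s1 is deliberate: in short-vector
 * mode, operands in bank 0 (s0-s7) are treated as scalars while s8-s31
 * iterate. Illustrative call:
 *
 *     float v[10], d[10];
 *     const float k = 2.0f;
 *     vfp_scalaradd_f32_ns(d, v, &k, 10);   // d[i] == v[i] + 2.0f
 */
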
EXPORT_C __NAKED__ void vfp_scalarmultiply_f32_ns (float *d, const float *s1, const float *s2_1, int n)
{
asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
//asm("fldmias r2, {s1}"); /* load scalar value once */
VFP_FLDMIAS(CC_AL,2,1,1);
asm("ands ip, r3, #7"); /* ip = n % 8 */
asm("beq vfp_scalarmultiply_f32_ns_unroll"); /* if ip == 0 goto unroll */
asm("vfp_scalarmultiply_f32_ns_loop1:");
//asm("fldmias r1!, {s0}");
VFP_FLDMIAS(CC_AL,1,0,1);
//asm("fmuls s2, s0, s1");
VFP_FMULS(CC_AL,2,0,1);
//asm("fstmias r0!, {s2}");
VFP_FSTMIAS(CC_AL,0,2,1);
asm("subs ip, ip, #1");
asm("bne vfp_scalarmultiply_f32_ns_loop1");
asm("vfp_scalarmultiply_f32_ns_unroll:"); /* unroll by 8 */
asm("movs ip, r3, lsr #3"); /* ip = n / 8 */
asm("beq vfp_scalarmultiply_f32_ns_end"); /* if ip == 0 goto finish */
//asm("fmrx lr, fpscr"); /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);
asm("mov fp, #7"); /* LEN - 1 */
asm("orr fp, lr, fp, lsl #16"); /* set vector length to 8 */
//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);
asm("vfp_scalarmultiply_f32_ns_loop2:");
//asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
VFP_FLDMIAS(CC_AL,1,8,8);
//asm("fmuls s24, s8, s1"); /* s1 sits in bank 0, so it stays scalar */
VFP_FMULS(CC_AL,24,8,1);
//asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
VFP_FSTMIAS(CC_AL,0,24,8);
asm("subs ip, ip, #1");
asm("bne vfp_scalarmultiply_f32_ns_loop2");
//asm("fmxr fpscr, lr"); /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);
asm("vfp_scalarmultiply_f32_ns_end:");
asm("ldmia sp!, {fp, pc}"); /* restore from stack and return */
}

EXPORT_C __NAKED__ void vfp_scalaradd_f64_ns (double *d, const double *s1, const double *s2_1, int n)
{
asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
//asm("fldmiad r2, {d1}"); /* load scalar value once */
VFP_FLDMIAD(CC_AL,2,1,1);
asm("ands ip, r3, #3"); /* ip = n % 4 */
asm("beq vfp_scalaradd_f64_ns_unroll"); /* if ip == 0 goto unroll */
asm("vfp_scalaradd_f64_ns_loop1:");
//asm("fldmiad r1!, {d0}");
VFP_FLDMIAD(CC_AL,1,0,1);
//asm("faddd d2, d0, d1");
VFP_FADDD(CC_AL,2,0,1);
//asm("fstmiad r0!, {d2}");
VFP_FSTMIAD(CC_AL,0,2,1);
asm("subs ip, ip, #1");
asm("bne vfp_scalaradd_f64_ns_loop1");
asm("vfp_scalaradd_f64_ns_unroll:"); /* unroll by 4 */
asm("movs ip, r3, lsr #2"); /* ip = n / 4 */
asm("beq vfp_scalaradd_f64_ns_end"); /* if ip == 0 goto finish */
//asm("fmrx lr, fpscr"); /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);
asm("mov fp, #3"); /* LEN - 1 */
asm("orr fp, lr, fp, lsl #16"); /* set vector length to 4 */
//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);
asm("vfp_scalaradd_f64_ns_loop2:");
//asm("fldmiad r1!, {d4, d5, d6, d7}");
VFP_FLDMIAD(CC_AL,1,4,4);
//asm("faddd d12, d4, d1"); /* d1 sits in bank 0, so it stays scalar */
VFP_FADDD(CC_AL,12,4,1);
//asm("fstmiad r0!, {d12, d13, d14, d15}");
VFP_FSTMIAD(CC_AL,0,12,4);
asm("subs ip, ip, #1");
asm("bne vfp_scalaradd_f64_ns_loop2");
//asm("fmxr fpscr, lr"); /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);
asm("vfp_scalaradd_f64_ns_end:");
asm("ldmia sp!, {fp, pc}"); /* restore from stack and return */
}

EXPORT_C __NAKED__ void vfp_scalarmultiply_f64_ns (double *d, const double *s1, const double *s2_1, int n)
{
asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
//asm("fldmiad r2, {d1}"); /* load scalar value once */
VFP_FLDMIAD(CC_AL,2,1,1);
asm("ands ip, r3, #3"); /* ip = n % 4 */
asm("beq vfp_scalarmultiply_f64_ns_unroll"); /* if ip == 0 goto unroll */
asm("vfp_scalarmultiply_f64_ns_loop1:");
//asm("fldmiad r1!, {d0}");
VFP_FLDMIAD(CC_AL,1,0,1);
//asm("fmuld d2, d0, d1");
VFP_FMULD(CC_AL,2,0,1); /* macro name assumed, by analogy with VFP_FMULS/VFP_FADDD */
//asm("fstmiad r0!, {d2}");
VFP_FSTMIAD(CC_AL,0,2,1);
asm("subs ip, ip, #1");
asm("bne vfp_scalarmultiply_f64_ns_loop1");
asm("vfp_scalarmultiply_f64_ns_unroll:"); /* unroll by 4 */
asm("movs ip, r3, lsr #2"); /* ip = n / 4 */
asm("beq vfp_scalarmultiply_f64_ns_end"); /* if ip == 0 goto finish */
//asm("fmrx lr, fpscr"); /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);
asm("mov fp, #3"); /* LEN - 1 */
asm("orr fp, lr, fp, lsl #16"); /* set vector length to 4 */
//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);
asm("vfp_scalarmultiply_f64_ns_loop2:");
//asm("fldmiad r1!, {d4, d5, d6, d7}");
VFP_FLDMIAD(CC_AL,1,4,4);
//asm("fmuld d12, d4, d1"); /* d1 sits in bank 0, so it stays scalar */
VFP_FMULD(CC_AL,12,4,1); /* macro name assumed, see above */
//asm("fstmiad r0!, {d12, d13, d14, d15}");
VFP_FSTMIAD(CC_AL,0,12,4);
asm("subs ip, ip, #1");
asm("bne vfp_scalarmultiply_f64_ns_loop2");
//asm("fmxr fpscr, lr"); /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);
asm("vfp_scalarmultiply_f64_ns_end:");
asm("ldmia sp!, {fp, pc}"); /* restore from stack and return */
}