#if __ARMCC__

#define __CPU_ARM
#define __CPU_HAS_VFP
#include
#include

extern "C" {

EXPORT_C __NAKED__ void vfp_add_f32 (float *d, const float *s1, const float *s2, int n)
{
    asm("stmdb sp!, {fp, lr}");                     /* save registers to stack */
    asm("ands ip, r3, #7");                         /* ip = n % 8 */
    asm("beq vfp_add_f32_unroll");                  /* if ip == 0 goto prep_loop2 */
    asm("vfp_add_f32_loop1:");

    //asm("fldmias r1!, {s0}");
    VFP_FLDMIAS(CC_AL,1,0,1);

    //asm("fldmias r2!, {s1}");
    VFP_FLDMIAS(CC_AL,2,1,1);

    //asm("fadds s2, s0, s1");
    VFP_FADDS(CC_AL,2,0,1);

    //asm("fstmias r0!, {s2}");
    VFP_FSTMIAS(CC_AL,0,2,1);

    asm("subs ip, ip, #1");
    asm("bne vfp_add_f32_loop1");
    asm("vfp_add_f32_unroll: movs ip, r3, lsr #3"); /* unroll by 8; ip = n / 8 */
    asm("beq vfp_add_f32_end");                     /* if ip == 0 goto finish */

    //asm("fmrx lr, fpscr");                        /* read fpscr register into arm */
    VFP_FMRX(,14,VFP_XREG_FPSCR);

    asm("mov fp, #7");
    asm("orr fp, lr, fp, lsl #16");                 /* set vector length to 8 */

    //asm("fmxr fpscr, fp");
    VFP_FMXR(,VFP_XREG_FPSCR,11);

    asm("vfp_add_f32_loop2:");

    //asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
    VFP_FLDMIAS(CC_AL,1,8,8);

    //asm("fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}");
    VFP_FLDMIAS(CC_AL,2,16,8);

    //asm("fadds s24, s8, s16");
    VFP_FADDS(CC_AL,24,8,16);

    //asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
    VFP_FSTMIAS(CC_AL,0,24,8);

    asm("subs ip, ip, #1");
    asm("bne vfp_add_f32_loop2");

    //asm("fmxr fpscr, lr");                        /* restore original fpscr */
    VFP_FMXR(,VFP_XREG_FPSCR,14);

    asm("vfp_add_f32_end:");
    asm("ldmia sp!, {fp, pc}");                     /* restore from stack and return */
}

EXPORT_C __NAKED__ void vfp_divide_f32 (float *d, const float *s1, const float *s2, int n)
{
    asm("stmdb sp!, {fp, lr}");                     /* save registers to stack */
    asm("ands ip, r3, #7");                         /* ip = n % 8 */
    asm("beq vfp_divide_f32_unroll");               /* if ip == 0 goto prep_loop2 */
    asm("vfp_divide_f32_loop1:");

    //asm("fldmias r1!, {s0}");
    VFP_FLDMIAS(CC_AL,1,0,1);

    //asm("fldmias r2!, {s1}");
    VFP_FLDMIAS(CC_AL,2,1,1);

    //asm("fdivs s2, s0, s1");
    VFP_FDIVS(CC_AL,2,0,1);

    //asm("fstmias r0!, {s2}");
    VFP_FSTMIAS(CC_AL,0,2,1);

    asm("subs ip, ip, #1");
    asm("bne vfp_divide_f32_loop1");
    asm("vfp_divide_f32_unroll: movs ip, r3, lsr #3");  /* unroll by 8; ip = n / 8 */
    asm("beq vfp_divide_f32_end");                  /* if ip == 0 goto finish */

    //asm("fmrx lr, fpscr");                        /* read fpscr register into arm */
    VFP_FMRX(,14,VFP_XREG_FPSCR);

    asm("mov fp, #7");
    asm("orr fp, lr, fp, lsl #16");                 /* set vector length to 8 */

    //asm("fmxr fpscr, fp");
    VFP_FMXR(,VFP_XREG_FPSCR,11);

    asm("vfp_divide_f32_loop2:");

    //asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
    VFP_FLDMIAS(CC_AL,1,8,8);

    //asm("fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}");
    VFP_FLDMIAS(CC_AL,2,16,8);

    //asm("fdivs s24, s8, s16");
    VFP_FDIVS(CC_AL,24,8,16);

    //asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
    VFP_FSTMIAS(CC_AL,0,24,8);

    asm("subs ip, ip, #1");
    asm("bne vfp_divide_f32_loop2");

    //asm("fmxr fpscr, lr");                        /* restore original fpscr */
    VFP_FMXR(,VFP_XREG_FPSCR,14);

    asm("vfp_divide_f32_end:");
    asm("ldmia sp!, {fp, pc}");                     /* restore from stack and return */
}

EXPORT_C __NAKED__ void vfp_multiply_f32 (float *d, const float *s1, const float *s2, int n)
{
    asm("stmdb sp!, {fp, lr}");                     /* save registers to stack */
    asm("ands ip, r3, #7");                         /* ip = n % 8 */
    asm("beq vfp_multiply_f32_unroll");             /* if ip == 0 goto prep_loop2 */
    asm("vfp_multiply_f32_loop1:");

    //asm("fldmias r1!, {s0}");
    VFP_FLDMIAS(CC_AL,1,0,1);

    //asm("fldmias r2!, {s1}");
    VFP_FLDMIAS(CC_AL,2,1,1);
sl@0: sl@0: //asm("fadds s2, s0, s1"); sl@0: VFP_FMULS(CC_AL,2,0,1); sl@0: sl@0: //asm("fstmias r0!, {s2}"); sl@0: VFP_FSTMIAS(CC_AL,0,2,1); sl@0: sl@0: asm("subs ip, ip, #1"); sl@0: asm("bne vfp_multiply_f32_loop1"); sl@0: asm("vfp_multiply_f32_unroll: movs ip, r3, lsr #3"); sl@0: asm("beq vfp_multiply_f32_end"); sl@0: sl@0: sl@0: //asm("fmrx lr, fpscr"); sl@0: VFP_FMRX(,14,VFP_XREG_FPSCR); sl@0: sl@0: sl@0: asm("mov fp, #7"); sl@0: asm("orr fp, lr, fp, lsl #16"); sl@0: sl@0: //asm("fmxr fpscr, fp"); sl@0: VFP_FMXR(,VFP_XREG_FPSCR,11); sl@0: sl@0: sl@0: asm("vfp_multiply_f32_loop2:"); sl@0: sl@0: //asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}"); sl@0: VFP_FLDMIAS(CC_AL,1,8,8); sl@0: sl@0: //asm("fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}"); sl@0: VFP_FLDMIAS(CC_AL,2,16,8); sl@0: sl@0: //asm("fadds s24, s8, s16"); sl@0: VFP_FMULS(CC_AL,24,8,16); sl@0: sl@0: asm("subs ip, ip, #1"); sl@0: asm("bne vfp_multiply_f32_loop2"); sl@0: sl@0: //asm("fmxr fpscr, lr"); sl@0: VFP_FMXR(,VFP_XREG_FPSCR,14); sl@0: sl@0: asm("vfp_multiply_f32_end:"); sl@0: asm ("ldmia sp!, {fp, pc}"); sl@0: sl@0: } sl@0: sl@0: EXPORT_C __NAKED__ void vfp_subtract_f32 (float *d, const float *s1, const float *s2, int n) sl@0: { sl@0: asm(" stmdb sp!, {fp, lr}"); sl@0: asm("ands ip, r3, #7"); sl@0: asm("beq vfp_subtract_f32_unroll"); sl@0: sl@0: //asm("fldmias r1!, {s0}"); sl@0: VFP_FLDMIAS(CC_AL,1,0,1); sl@0: sl@0: asm("vfp_subtract_f32_loop1:"); sl@0: sl@0: //asm("fldmias r2!, {s1}"); sl@0: VFP_FLDMIAS(CC_AL,2,1,1); sl@0: sl@0: //asm("fadds s2, s0, s1"); sl@0: VFP_FSUBS(CC_AL,2,0,1); sl@0: sl@0: //asm("fstmias r0!, {s2}"); sl@0: VFP_FSTMIAS(CC_AL,0,2,1); sl@0: sl@0: asm("subs ip, ip, #1"); sl@0: asm("bne vfp_subtract_f32_loop1"); sl@0: asm("vfp_subtract_f32_unroll: movs ip, r3, lsr #3"); sl@0: asm("beq vfp_subtract_f32_end"); sl@0: sl@0: sl@0: //asm("fmrx lr, fpscr"); sl@0: VFP_FMRX(,14,VFP_XREG_FPSCR); sl@0: sl@0: sl@0: asm("mov fp, #7"); sl@0: asm("orr fp, lr, fp, lsl #16"); sl@0: sl@0: //asm("fmxr fpscr, fp"); sl@0: VFP_FMXR(,VFP_XREG_FPSCR,11); sl@0: sl@0: sl@0: asm("vfp_subtract_f32_loop2:"); sl@0: sl@0: //asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}"); sl@0: VFP_FLDMIAS(CC_AL,1,8,8); sl@0: sl@0: //asm("fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}"); sl@0: VFP_FLDMIAS(CC_AL,2,16,8); sl@0: sl@0: //asm("fadds s24, s8, s16"); sl@0: VFP_FSUBS(CC_AL,24,8,16); sl@0: sl@0: asm("subs ip, ip, #1"); sl@0: asm("bne vfp_subtract_f32_loop2"); sl@0: sl@0: //asm("fmxr fpscr, lr"); sl@0: VFP_FMXR(,VFP_XREG_FPSCR,14); sl@0: sl@0: asm("vfp_subtract_f32_end:"); sl@0: asm ("ldmia sp!, {fp, pc}"); sl@0: sl@0: } sl@0: sl@0: EXPORT_C __NAKED__ void vfp_add_f64 (double *d, const double *s1, const double *s2, int n) sl@0: { sl@0: asm("stmdb sp!, {fp, lr}"); /* save registers to stack */ sl@0: asm("ands ip, r3, #3"); /* ip = n % 3 */ sl@0: asm("beq vfp_add_f64_unroll"); /* if ip == 0 goto prep_loop2 */ sl@0: asm("vfp_add_f64_loop1:"); sl@0: sl@0: //asm("fldmiad r1!, {d0}"); sl@0: VFP_FLDMIAD(CC_AL,1,0,1); sl@0: sl@0: //asm("fldmiad r2!, {d1}"); sl@0: VFP_FLDMIAD(CC_AL,2,1,1); sl@0: sl@0: //asm("faddd d2, d0, d1"); sl@0: VFP_FADDD(,2,0,1); sl@0: sl@0: //asm("fstmiad r0!, {d2}"); sl@0: VFP_FSTMIAD(CC_AL,0,2,1); sl@0: sl@0: asm("subs ip, ip, #1"); sl@0: asm("bne vfp_add_f64_loop1"); sl@0: asm("vfp_add_f64_unroll:"); /* unroll by 4 */ sl@0: asm("movs ip, r3, lsr #2"); /* ip = n / 4 */ sl@0: asm(" beq vfp_add_f64_end"); /* if ip == 0 goto finish */ sl@0: sl@0: //asm(" fmrx lr, fpscr"); /* read fpscr 
register into arm */ sl@0: VFP_FMRX(,14,VFP_XREG_FPSCR); sl@0: sl@0: asm("mov fp, #3"); sl@0: asm("orr fp, lr, fp, lsl #16"); /* set vector lenght to 8 */ sl@0: sl@0: //asm("fmxr fpscr, fp"); sl@0: VFP_FMXR(,VFP_XREG_FPSCR,11); sl@0: sl@0: asm("vfp_add_f64_loop2:"); sl@0: sl@0: //asm("fldmiad r1!, {d4, d5, d6, d7}"); sl@0: VFP_FLDMIAS(CC_AL,1,4,4); sl@0: sl@0: //asm("fldmiad r2!, {d8, d9, d10, d11}"); sl@0: VFP_FLDMIAS(CC_AL,2,8,4); sl@0: sl@0: //asm("faddd d12, d4, d8"); sl@0: VFP_FADDD(,12,4,8); sl@0: sl@0: //asm("fstmiad r0!, {d12, d13, d14, d15}"); sl@0: VFP_FSTMIAS(CC_AL,0,12,4); sl@0: sl@0: asm("subs ip, ip, #1"); sl@0: asm("bne vfp_add_f64_loop2"); sl@0: sl@0: //asm("fmxr fpscr, lr"); /* restore original fpscr */ sl@0: VFP_FMXR(,VFP_XREG_FPSCR,14); sl@0: sl@0: asm("vfp_add_f64_end:"); sl@0: asm("ldmia sp!, {fp, pc}"); /* recovering from stack and return */ sl@0: } sl@0: sl@0: sl@0: sl@0: sl@0: EXPORT_C __NAKED__ void vfp_abs_f32_f32_ns(float *d, const float *s, int n) sl@0: { sl@0: asm("stmdb sp!, {fp, lr}"); /* save registers to stack */ sl@0: asm("ands ip, r2, #7"); /* ip = n % 8 */ sl@0: asm("beq vfp_abs_f32_f32_ns_unroll"); /* if ip == 0 goto prep_loop2 */ sl@0: asm("vfp_abs_f32_f32_ns_loop1:"); sl@0: sl@0: //asm("fldmias r1!, {s0}"); sl@0: VFP_FLDMIAS(CC_AL,1,0,1); sl@0: sl@0: //asm("fabss s2, s0"); sl@0: VFP_FABSS(CC_AL,2,0); sl@0: sl@0: //asm("fstmias r0!, {s2}"); sl@0: VFP_FSTMIAS(CC_AL,0,2,1); sl@0: sl@0: asm("subs ip, ip, #1"); sl@0: asm("bne vfp_abs_f32_f32_ns_loop1"); sl@0: asm("vfp_abs_f32_f32_ns_unroll:"); /* unroll by 8 */ sl@0: asm("movs ip, r2, lsr #3"); /* ip = n / 8 */ sl@0: asm("beq vfp_abs_f32_f32_ns_end"); /* if ip == 0 goto finish */ sl@0: sl@0: //asm("fmrx lr, fpscr"); /* read fpscr register into arm */ sl@0: VFP_FMRX(,14,VFP_XREG_FPSCR); sl@0: sl@0: asm("mov fp, #7"); sl@0: asm("orr fp, lr, fp, lsl #16"); /* set vector lenght to 8 */ sl@0: sl@0: //asm("fmxr fpscr, fp"); sl@0: VFP_FMXR(,VFP_XREG_FPSCR,11); sl@0: sl@0: asm("vfp_abs_f32_f32_ns_loop2:"); sl@0: sl@0: //asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}"); sl@0: VFP_FLDMIAS(CC_AL,1,8,8); sl@0: sl@0: //asm("fabss s24, s8"); sl@0: VFP_FABSS(CC_AL,2,0); sl@0: sl@0: //asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}"); sl@0: VFP_FSTMIAS(CC_AL,0,24,8); sl@0: sl@0: asm("subs ip, ip, #1"); sl@0: asm("bne vfp_abs_f32_f32_ns_loop2"); sl@0: sl@0: //asm("fmxr fpscr, lr"); /* restore original fpscr */ sl@0: VFP_FMXR(,VFP_XREG_FPSCR,14); sl@0: sl@0: asm("vfp_abs_f32_f32_ns_end:"); sl@0: asm("ldmia sp!, {fp, pc}"); /* recovering from stack and return */ sl@0: } sl@0: sl@0: EXPORT_C __NAKED__ void vfp_negative_f32(float *d, const float *s, int n) sl@0: { sl@0: asm("stmdb sp!, {fp, lr}"); /* save registers to stack */ sl@0: asm("ands ip, r2, #7"); /* ip = n % 8 */ sl@0: asm("beq vfp_negative_f32_unroll"); /* if ip == 0 goto prep_loop2 */ sl@0: asm("vfp_negative_f32_loop1:"); sl@0: sl@0: //asm("fldmias r1!, {s0}"); sl@0: VFP_FLDMIAS(CC_AL,1,0,1); sl@0: sl@0: //asm("fnegs s2, s0"); sl@0: VFP_FNEGS(CC_AL,2,0); sl@0: sl@0: //asm("fstmias r0!, {s2}"); sl@0: VFP_FSTMIAS(CC_AL,0,2,1); sl@0: sl@0: asm("subs ip, ip, #1"); sl@0: asm("bne vfp_negative_f32_loop1"); sl@0: asm("vfp_negative_f32_unroll:"); /* unroll by 8 */ sl@0: asm("movs ip, r2, lsr #3"); /* ip = n / 8 */ sl@0: asm("beq vfp_negative_f32_end"); /* if ip == 0 goto finish */ sl@0: sl@0: //asm("fmrx lr, fpscr"); /* read fpscr register into arm */ sl@0: VFP_FMRX(,14,VFP_XREG_FPSCR); sl@0: sl@0: asm("mov fp, #7"); sl@0: asm("orr fp, lr, fp, lsl 
#16"); /* set vector lenght to 8 */ sl@0: sl@0: // asm("fmxr fpscr, fp"); sl@0: VFP_FMXR(,VFP_XREG_FPSCR,11); sl@0: sl@0: asm("vfp_negative_f32_loop2:"); sl@0: sl@0: //asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}"); sl@0: VFP_FLDMIAS(CC_AL,1,8,8); sl@0: sl@0: //asm("fnegs s24, s8"); sl@0: VFP_FNEGS(CC_AL,2,0); sl@0: sl@0: //asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}"); sl@0: VFP_FSTMIAS(CC_AL,0,24,8); sl@0: sl@0: asm("subs ip, ip, #1"); sl@0: asm("bne vfp_negative_f32_loop2"); sl@0: sl@0: //asm("fmxr fpscr, lr"); /* restore original fpscr */ sl@0: VFP_FMXR(,VFP_XREG_FPSCR,14); sl@0: sl@0: asm("vfp_negative_f32_end:"); sl@0: asm("ldmia sp!, {fp, pc}"); /* recovering from stack and return */ sl@0: } sl@0: sl@0: EXPORT_C __NAKED__ void vfp_abs_f64_f64_ns(double *d, const double *s, int n) sl@0: { sl@0: asm("stmdb sp!, {fp, lr}"); /* save registers to stack */ sl@0: asm("ands ip, r2, #3"); /* ip = n % 3 */ sl@0: asm("beq vfp_abs_f64_f64_ns_unroll"); /* if ip == 0 goto prep_loop2 */ sl@0: asm("vfp_abs_f64_f64_ns_loop1:"); sl@0: sl@0: //asm("fldmiad r1!, {d0}"); sl@0: VFP_FLDMIAD(CC_AL,1,0,1); sl@0: sl@0: //asm("fabsd d2, d0"); sl@0: VFP_FABSD(,2,0); sl@0: sl@0: //asm("fstmiad r0!, {d2}"); sl@0: VFP_FSTMIAD(CC_AL,0,2,1); sl@0: sl@0: asm("subs ip, ip, #1"); sl@0: asm("bne vfp_abs_f64_f64_ns_loop1"); sl@0: asm("vfp_abs_f64_f64_ns_unroll:"); /* unroll by 4 */ sl@0: asm("movs ip, r2, lsr #2"); /* ip = n / 4 */ sl@0: asm("beq vfp_abs_f64_f64_ns_end"); /* if ip == 0 goto finish */ sl@0: sl@0: //asm("fmrx lr, fpscr"); /* read fpscr register into arm */ sl@0: VFP_FMRX(,14,VFP_XREG_FPSCR); sl@0: sl@0: asm("mov fp, #3"); sl@0: asm("orr fp, lr, fp, lsl #16"); /* set vector lenght to 4 */ sl@0: sl@0: //asm("fmxr fpscr, fp"); sl@0: VFP_FMXR(,VFP_XREG_FPSCR,11); sl@0: sl@0: asm("vfp_abs_f64_f64_ns_loop2:"); sl@0: sl@0: sl@0: //asm("fldmiad r1!, {d4, d5, d6, d7}"); sl@0: VFP_FLDMIAD(CC_AL,1,4,4); sl@0: sl@0: //asm("fabsd d12, d4"); sl@0: VFP_FABSD(,12,4); sl@0: sl@0: //asm("fstmiad r0!, {d12, d13, d14, d15}"); sl@0: VFP_FSTMIAD(CC_AL,0,12,4); sl@0: sl@0: asm("subs ip, ip, #1"); sl@0: asm("bne vfp_abs_f64_f64_ns_loop2"); sl@0: sl@0: // asm("fmxr fpscr, lr"); /* restore original fpscr */ sl@0: VFP_FMXR(,VFP_XREG_FPSCR,14); sl@0: sl@0: asm("vfp_abs_f64_f64_ns_end:"); sl@0: asm("ldmia sp!, {fp, pc}"); /* recovering from stack and return */ sl@0: } sl@0: sl@0: sl@0: EXPORT_C __NAKED__ void vfp_negative_f64(double *d, const double *s, int n) sl@0: { sl@0: asm("stmdb sp!, {fp, lr}"); /* save registers to stack */ sl@0: asm("ands ip, r2, #3"); /* ip = n % 3 */ sl@0: asm("beq vfp_negative_f64_unroll"); /* if ip == 0 goto prep_loop2 */ sl@0: asm("vfp_negative_f64_loop1:"); sl@0: sl@0: //asm("fldmiad r1!, {d0}"); sl@0: VFP_FLDMIAD(CC_AL,1,0,1); sl@0: sl@0: //asm("fnegd d2, d0"); sl@0: VFP_FNEGD(,2,0); sl@0: sl@0: //asm("fstmiad r0!, {d2}"); sl@0: VFP_FSTMIAD(CC_AL,0,2,1); sl@0: sl@0: asm("subs ip, ip, #1"); sl@0: asm("bne vfp_negative_f64_loop1"); sl@0: asm("vfp_negative_f64_unroll:"); /* unroll by 4 */ sl@0: asm("movs ip, r2, lsr #2"); /* ip = n / 4 */ sl@0: asm("beq vfp_negative_f64_end"); /* if ip == 0 goto finish */ sl@0: sl@0: //asm("fmrx lr, fpscr"); /* read fpscr register into arm */ sl@0: VFP_FMRX(,14,VFP_XREG_FPSCR); sl@0: sl@0: asm("mov fp, #3"); sl@0: asm("orr fp, lr, fp, lsl #16"); /* set vector lenght to 4 */ sl@0: sl@0: //asm("fmxr fpscr, fp"); sl@0: VFP_FMXR(,VFP_XREG_FPSCR,11); sl@0: sl@0: asm("vfp_negative_f64_loop2:"); sl@0: sl@0: //asm("fldmiad r1!, {d4, d5, d6, d7}"); sl@0: 
    VFP_FLDMIAD(CC_AL,1,4,4);

    //asm("fnegd d12, d4");
    VFP_FNEGD(,12,4);

    //asm("fstmiad r0!, {d12, d13, d14, d15}");
    VFP_FSTMIAD(CC_AL,0,12,4);

    asm("subs ip, ip, #1");
    asm("bne vfp_negative_f64_loop2");

    //asm("fmxr fpscr, lr");                        /* restore original fpscr */
    VFP_FMXR(,VFP_XREG_FPSCR,14);

    asm("vfp_negative_f64_end:");
    asm("ldmia sp!, {fp, pc}");                     /* restore from stack and return */
}

//Rakhi changes
EXPORT_C __NAKED__ void vfp_divide_f64 (double *d, const double *s1, const double *s2, int n)
{
    asm("stmdb sp!, {fp, lr}");                     /* save registers to stack */
    asm("ands ip, r3, #3");                         /* ip = n % 4 */
    asm("beq vfp_divide_f64_unroll");               /* if ip == 0 goto prep_loop2 */
    asm("vfp_divide_f64_loop1:");

    //asm("fldmiad r1!, {d0}");
    VFP_FLDMIAD(CC_AL,1,0,1);

    //asm("fldmiad r2!, {d1}");
    VFP_FLDMIAD(CC_AL,2,1,1);

    //asm("fdivd d2, d0, d1");
    VFP_FDIVD(,2,0,1);

    //asm("fstmiad r0!, {d2}");
    VFP_FSTMIAD(CC_AL,0,2,1);

    asm("subs ip, ip, #1");
    asm("bne vfp_divide_f64_loop1");
    asm("vfp_divide_f64_unroll:");                  /* unroll by 4 */
    asm("movs ip, r3, lsr #2");                     /* ip = n / 4 */
    asm("beq vfp_divide_f64_end");                  /* if ip == 0 goto finish */

    //asm("fmrx lr, fpscr");                        /* read fpscr register into arm */
    VFP_FMRX(,14,VFP_XREG_FPSCR);

    asm("mov fp, #3");
    asm("orr fp, lr, fp, lsl #16");                 /* set vector length to 4 */

    //asm("fmxr fpscr, fp");
    VFP_FMXR(,VFP_XREG_FPSCR,11);

    asm("vfp_divide_f64_loop2:");

    //asm("fldmiad r1!, {d4, d5, d6, d7}");
    VFP_FLDMIAD(CC_AL,1,4,4);

    //asm("fldmiad r2!, {d8, d9, d10, d11}");
    VFP_FLDMIAD(CC_AL,2,8,4);

    //asm("fdivd d12, d4, d8");
    VFP_FDIVD(,12,4,8);

    //asm("fstmiad r0!, {d12, d13, d14, d15}");
    VFP_FSTMIAD(CC_AL,0,12,4);

    asm("subs ip, ip, #1");
    asm("bne vfp_divide_f64_loop2");

    //asm("fmxr fpscr, lr");                        /* restore original fpscr */
    VFP_FMXR(,VFP_XREG_FPSCR,14);

    asm("vfp_divide_f64_end:");
    asm("ldmia sp!, {fp, pc}");                     /* restore from stack and return */
}

EXPORT_C __NAKED__ void vfp_multiply_f64 (double *d, const double *s1, const double *s2, int n)
{
    asm("stmdb sp!, {fp, lr}");                     /* save registers to stack */
    asm("ands ip, r3, #3");                         /* ip = n % 4 */
    asm("beq vfp_multiply_f64_unroll");             /* if ip == 0 goto prep_loop2 */
    asm("vfp_multiply_f64_loop1:");

    //asm("fldmiad r1!, {d0}");
    VFP_FLDMIAD(CC_AL,1,0,1);

    //asm("fldmiad r2!, {d1}");
    VFP_FLDMIAD(CC_AL,2,1,1);

    //asm("fmuld d2, d0, d1");
    VFP_FMULD(,2,0,1);

    //asm("fstmiad r0!, {d2}");
    VFP_FSTMIAD(CC_AL,0,2,1);

    asm("subs ip, ip, #1");
    asm("bne vfp_multiply_f64_loop1");
    asm("vfp_multiply_f64_unroll:");                /* unroll by 4 */
    asm("movs ip, r3, lsr #2");                     /* ip = n / 4 */
    asm("beq vfp_multiply_f64_end");                /* if ip == 0 goto finish */

    //asm("fmrx lr, fpscr");                        /* read fpscr register into arm */
    VFP_FMRX(,14,VFP_XREG_FPSCR);

    asm("mov fp, #3");
    asm("orr fp, lr, fp, lsl #16");                 /* set vector length to 4 */

    //asm("fmxr fpscr, fp");
    VFP_FMXR(,VFP_XREG_FPSCR,11);

    asm("vfp_multiply_f64_loop2:");

    //asm("fldmiad r1!, {d4, d5, d6, d7}");
    VFP_FLDMIAD(CC_AL,1,4,4);

    //asm("fldmiad r2!, {d8, d9, d10, d11}");
    VFP_FLDMIAD(CC_AL,2,8,4);

    //asm("fmuld d12, d4, d8");
    VFP_FMULD(,12,4,8);

    //asm("fstmiad r0!, {d12, d13, d14, d15}");
    VFP_FSTMIAD(CC_AL,0,12,4);

    asm("subs ip, ip, #1");
    asm("bne vfp_multiply_f64_loop2");

    //asm("fmxr fpscr, lr");                        /* restore original fpscr */
    VFP_FMXR(,VFP_XREG_FPSCR,14);

    asm("vfp_multiply_f64_end:");
    asm("ldmia sp!, {fp, pc}");                     /* restore from stack and return */
}

EXPORT_C __NAKED__ void vfp_subtract_f64 (double *d, const double *s1, const double *s2, int n)
{
    asm("stmdb sp!, {fp, lr}");                     /* save registers to stack */
    asm("ands ip, r3, #3");                         /* ip = n % 4 */
    asm("beq vfp_subtract_f64_unroll");             /* if ip == 0 goto prep_loop2 */
    asm("vfp_subtract_f64_loop1:");

    //asm("fldmiad r1!, {d0}");
    VFP_FLDMIAD(CC_AL,1,0,1);

    //asm("fldmiad r2!, {d1}");
    VFP_FLDMIAD(CC_AL,2,1,1);

    //asm("fsubd d2, d0, d1");
    VFP_FSUBD(,2,0,1);

    //asm("fstmiad r0!, {d2}");
    VFP_FSTMIAD(CC_AL,0,2,1);

    asm("subs ip, ip, #1");
    asm("bne vfp_subtract_f64_loop1");
    asm("vfp_subtract_f64_unroll:");                /* unroll by 4 */
    asm("movs ip, r3, lsr #2");                     /* ip = n / 4 */
    asm("beq vfp_subtract_f64_end");                /* if ip == 0 goto finish */

    //asm("fmrx lr, fpscr");                        /* read fpscr register into arm */
    VFP_FMRX(,14,VFP_XREG_FPSCR);

    asm("mov fp, #3");
    asm("orr fp, lr, fp, lsl #16");                 /* set vector length to 4 */

    //asm("fmxr fpscr, fp");
    VFP_FMXR(,VFP_XREG_FPSCR,11);

    asm("vfp_subtract_f64_loop2:");

    //asm("fldmiad r1!, {d4, d5, d6, d7}");
    VFP_FLDMIAD(CC_AL,1,4,4);

    //asm("fldmiad r2!, {d8, d9, d10, d11}");
    VFP_FLDMIAD(CC_AL,2,8,4);

    //asm("fsubd d12, d4, d8");
    VFP_FSUBD(,12,4,8);

    //asm("fstmiad r0!, {d12, d13, d14, d15}");
    VFP_FSTMIAD(CC_AL,0,12,4);

    asm("subs ip, ip, #1");
    asm("bne vfp_subtract_f64_loop2");

    //asm("fmxr fpscr, lr");                        /* restore original fpscr */
    VFP_FMXR(,VFP_XREG_FPSCR,14);

    asm("vfp_subtract_f64_end:");
    asm("ldmia sp!, {fp, pc}");                     /* restore from stack and return */
}

EXPORT_C __NAKED__ void vfp_scalaradd_f32_ns (float *d, const float *s1, const float *s2_1, int n)
{
    asm("stmdb sp!, {fp, lr}");                     /* save registers to stack */

    //asm("fldmias r2, {s1}");                      /* load scalar value */
    VFP_FLDMIAS(CC_AL,2,1,1);

    asm("ands ip, r3, #7");                         /* ip = n % 8 */
    asm("beq vfp_scalaradd_f32_ns_unroll");         /* if ip == 0 goto prep_loop2 */
    asm("vfp_scalaradd_f32_ns_loop1:");

    //asm("fldmias r1!, {s0}");
    VFP_FLDMIAS(CC_AL,1,0,1);

    //asm("fadds s2, s0, s1");
    VFP_FADDS(CC_AL,2,0,1);

    //asm("fstmias r0!, {s2}");
    VFP_FSTMIAS(CC_AL,0,2,1);

    asm("subs ip, ip, #1");
    asm("bne vfp_scalaradd_f32_ns_loop1");
    asm("vfp_scalaradd_f32_ns_unroll:");            /* unroll by 8 */
    asm("movs ip, r3, lsr #3");                     /* ip = n / 8 */
    asm("beq vfp_scalaradd_f32_ns_end");            /* if ip == 0 goto finish */

    //asm("fmrx lr, fpscr");                        /* read fpscr register into arm */
    VFP_FMRX(,14,VFP_XREG_FPSCR);

    asm("mov fp, #7");
    asm("orr fp, lr, fp, lsl #16");                 /* set vector length to 8 */
sl@0: //asm("fmxr fpscr, fp"); sl@0: VFP_FMXR(,VFP_XREG_FPSCR,11); sl@0: sl@0: asm("vfp_scalaradd_f32_ns_loop2:"); sl@0: //asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}"); sl@0: VFP_FLDMIAS(CC_AL,1,8,8); sl@0: sl@0: //asm("FADDS s24, s8, s1"); sl@0: VFP_FADDS(CC_AL,24,8,1); sl@0: sl@0: //asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}"); sl@0: VFP_FSTMIAS(CC_AL,0,24,8); sl@0: sl@0: asm("subs ip, ip, #1"); sl@0: asm("bne vfp_scalaradd_f32_ns_loop2"); sl@0: sl@0: //asm("fmxr fpscr, lr"); /* restore original fpscr */ sl@0: VFP_FMXR(,VFP_XREG_FPSCR,14); sl@0: sl@0: asm("vfp_scalaradd_f32_ns_end:"); sl@0: asm("ldmia sp!, {fp, pc}"); /* recovering from stack and return */ sl@0: } sl@0: sl@0: EXPORT_C __NAKED__ void vfp_scalarmultiply_f32_ns (float *d, const float *s1, const float *s2_1, int n) sl@0: { sl@0: asm("stmdb sp!, {fp, lr}"); /* save registers to stack */ sl@0: sl@0: //asm("fldmias r2, {s1}"); /* load scalar value */ sl@0: VFP_FLDMIAS(CC_AL,2,1,1); sl@0: sl@0: asm("ands ip, r3, #7"); /* ip = n % 8 */ sl@0: asm("beq vfp_scalarmultiply_f32_ns_unroll"); /* if ip == 0 goto prep_loop2 */ sl@0: asm("vfp_scalarmultiply_f32_ns_loop1:"); sl@0: sl@0: //asm("fldmias r1!, {s0}"); sl@0: VFP_FLDMIAS(CC_AL,1,0,1); sl@0: sl@0: //asm("FADDS s2, s0, s1"); sl@0: VFP_FMULS(CC_AL,2,0,1); sl@0: sl@0: //asm("fstmias r0!, {s2}"); sl@0: VFP_FSTMIAS(CC_AL,0,2,8); sl@0: sl@0: asm("subs ip, ip, #1"); sl@0: asm("bne vfp_scalarmultiply_f32_ns_loop1"); sl@0: asm("vfp_scalarmultiply_f32_ns_unroll:"); /* unroll by 8 */ sl@0: asm("movs ip, r3, lsr #3"); /* ip = n / 8 */ sl@0: asm("beq vfp_scalarmultiply_f32_ns_end"); /* if ip == 0 goto finish */ sl@0: sl@0: //asm("fmrx lr, fpscr"); /* read fpscr register into arm */\ sl@0: VFP_FMRX(,14,VFP_XREG_FPSCR); sl@0: sl@0: asm("mov fp, #7"); sl@0: asm("orr fp, lr, fp, lsl #16"); /* set vector lenght to 8 */ sl@0: sl@0: //asm("fmxr fpscr, fp"); sl@0: VFP_FMXR(,VFP_XREG_FPSCR,11); sl@0: sl@0: asm("vfp_scalarmultiply_f32_ns_loop2:"); sl@0: //asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}"); sl@0: VFP_FLDMIAS(CC_AL,1,8,8); sl@0: sl@0: //asm("FADDS s24, s8, s1"); sl@0: VFP_FMULS(CC_AL,24,8,1); sl@0: sl@0: //asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}"); sl@0: VFP_FSTMIAS(CC_AL,0,24,8); sl@0: sl@0: asm("subs ip, ip, #1"); sl@0: asm("bne vfp_scalarmultiply_f32_ns_loop2"); sl@0: sl@0: //asm("fmxr fpscr, lr"); /* restore original fpscr */ sl@0: VFP_FMXR(,VFP_XREG_FPSCR,14); sl@0: sl@0: asm("vfp_scalarmultiply_f32_ns_end:"); sl@0: asm("ldmia sp!, {fp, pc}"); /* recovering from stack and return */ sl@0: } sl@0: sl@0: EXPORT_C __NAKED__ void vfp_scalaradd_f64_ns (double *d, const double *s1, const double *s2_1, int n) sl@0: { sl@0: asm("stmdb sp!, {fp, lr}"); /* save registers to stack */ sl@0: sl@0: //asm("fldmiad r2, {d1}"); /* load scalar value */ sl@0: VFP_FLDMIAD(CC_AL,2,1,1); sl@0: sl@0: asm("ands ip, r3, #3"); /* ip = n % 3 */ sl@0: asm("beq vfp_scalaradd_f64_ns_unroll"); /* if ip == 0 goto prep_loop2 */ sl@0: asm("vfp_scalaradd_f64_ns_loop1:"); sl@0: //asm("fldmiad r1!, {d0}"); sl@0: VFP_FLDMIAD(CC_AL,1,0,1); sl@0: sl@0: //asm("VFP_FADDD d2, d0, d1"); sl@0: VFP_FADDD(,2,0,1); sl@0: sl@0: //asm("fstmiad r0!, {d2}"); sl@0: VFP_FSTMIAD(CC_AL,0,2,1); sl@0: sl@0: asm("subs ip, ip, #1"); sl@0: asm("bne vfp_scalaradd_f64_ns_loop1"); sl@0: asm("vfp_scalaradd_f64_ns_unroll:"); /* unroll by 4 */ sl@0: asm("movs ip, r3, lsr #2"); /* ip = n / 4 */ sl@0: asm("beq vfp_scalaradd_f64_ns_end"); /* if ip == 0 goto finish */ sl@0: sl@0: //asm("fmrx lr, 
fpscr"); /* read fpscr register into arm */\ sl@0: VFP_FMRX(,14,VFP_XREG_FPSCR); sl@0: sl@0: asm("mov fp, #3"); sl@0: asm("orr fp, lr, fp, lsl #16"); /* set vector lenght to 4 */ sl@0: sl@0: //asm("fmxr fpscr, fp"); sl@0: VFP_FMXR(,VFP_XREG_FPSCR,11); sl@0: sl@0: asm("vfp_scalaradd_f64_ns_loop2:"); sl@0: sl@0: //asm("fldmiad r1!, {d4, d5, d6, d7}"); sl@0: VFP_FLDMIAD(CC_AL,1,4,4); sl@0: sl@0: //asm("VFP_FADDD d12, d4, d1"); sl@0: VFP_FADDD(,12,4,1); sl@0: sl@0: //asm("fstmiad r0!, {d12, d13, d14, d15}"); sl@0: VFP_FSTMIAD(CC_AL,0,12,4); sl@0: sl@0: asm("subs ip, ip, #1"); sl@0: asm("bne vfp_scalaradd_f64_ns_loop2"); sl@0: sl@0: //asm("fmxr fpscr, lr"); /* restore original fpscr */ sl@0: VFP_FMXR(,VFP_XREG_FPSCR,14); sl@0: sl@0: asm("vfp_scalaradd_f64_ns_end:"); sl@0: asm("ldmia sp!, {fp, pc}"); /* recovering from stack and return */ sl@0: } sl@0: sl@0: EXPORT_C __NAKED__ void vfp_scalarmultiply_f64_ns (double *d, const double *s1, const double *s2_1, int n) sl@0: { sl@0: sl@0: asm("stmdb sp!, {fp, lr}"); /* save registers to stack */ sl@0: sl@0: //asm("fldmiad r2, {d1}"); /* load scalar value */ sl@0: VFP_FLDMIAD(CC_AL,2,1,1); sl@0: sl@0: asm("ands ip, r3, #3"); /* ip = n % 3 */ sl@0: asm("beq vfp_scalarmultiply_f64_ns_unroll"); /* if ip == 0 goto prep_loop2 */ sl@0: asm("vfp_scalarmultiply_f64_ns_loop1:"); sl@0: //asm("fldmiad r1!, {d0}"); sl@0: VFP_FLDMIAD(CC_AL,1,0,1); sl@0: sl@0: //asm("VFP_FADDD d2, d0, d1"); sl@0: VFP_FMULD(,2,0,1); sl@0: sl@0: //asm("fstmiad r0!, {d2}"); sl@0: VFP_FSTMIAD(CC_AL,0,2,1); sl@0: sl@0: asm("subs ip, ip, #1"); sl@0: asm("bne vfp_scalarmultiply_f64_ns_loop1"); sl@0: asm("vfp_scalarmultiply_f64_ns_unroll:"); /* unroll by 4 */ sl@0: asm("movs ip, r3, lsr #2"); /* ip = n / 4 */ sl@0: asm("beq vfp_scalarmultiply_f64_ns_end"); /* if ip == 0 goto finish */ sl@0: sl@0: //asm("fmrx lr, fpscr"); /* read fpscr register into arm */\ sl@0: VFP_FMRX(,14,VFP_XREG_FPSCR); sl@0: sl@0: asm("mov fp, #3"); sl@0: asm("orr fp, lr, fp, lsl #16"); /* set vector lenght to 4 */ sl@0: sl@0: //asm("fmxr fpscr, fp"); sl@0: VFP_FMXR(,VFP_XREG_FPSCR,11); sl@0: sl@0: asm("vfp_scalarmultiply_f64_ns_loop2:"); sl@0: sl@0: //asm("fldmiad r1!, {d4, d5, d6, d7}"); sl@0: VFP_FLDMIAD(CC_AL,1,4,4); sl@0: sl@0: //asm("VFP_FADDD d12, d4, d1"); sl@0: VFP_FMULD(,12,4,1); sl@0: sl@0: //asm("fstmiad r0!, {d12, d13, d14, d15}"); sl@0: VFP_FSTMIAD(CC_AL,0,12,4); sl@0: sl@0: asm("subs ip, ip, #1"); sl@0: asm("bne vfp_scalarmultiply_f64_ns_loop2"); sl@0: sl@0: //asm("fmxr fpscr, lr"); /* restore original fpscr */ sl@0: VFP_FMXR(,VFP_XREG_FPSCR,14); sl@0: sl@0: asm("vfp_scalarmultiply_f64_ns_end:"); sl@0: asm("ldmia sp!, {fp, pc}"); /* recovering from stack and return */ sl@0: sl@0: } sl@0: sl@0: sl@0: } sl@0: #endif