1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/ossrv/genericopenlibs/liboil/src/arm/math_vfp_asm.cia Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,919 @@
1.4 +#if __ARMCC__
1.5 +
1.6 +#define __CPU_ARM
1.7 +#define __CPU_HAS_VFP
1.8 +#include <arm_vfp.h>
1.9 +#include <e32std.h>
1.10 +
1.11 +
1.12 +extern "C" {
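+
+/*
+ * Common structure of the routines below: a scalar loop first handles the
+ * n % BLOCK leftover elements, then the remaining elements are processed in
+ * blocks using VFP short vectors, by programming the LEN field (bits [18:16])
+ * of FPSCR so that a single arithmetic instruction operates on a whole
+ * register bank (8 singles or 4 doubles). The original FPSCR is saved in lr
+ * and restored before returning. The VFP_* macros from arm_vfp.h emit the
+ * instruction encodings; the commented-out asm() lines show the intended
+ * VFP mnemonic in each case.
+ */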
1.13 +
1.14 +EXPORT_C __NAKED__ void vfp_add_f32 (float *d, const float *s1, const float *s2, int n)
1.15 + {
1.16 + asm(" stmdb sp!, {fp, lr}");
1.17 + asm("ands ip, r3, #7");
1.18 + asm("beq vfp_add_f32_unroll");
1.19 +
1.20 +	asm("vfp_add_f32_loop1: ");
1.21 +
1.22 +	//asm("fldmias r1!, {s0}");
1.23 +	VFP_FLDMIAS(CC_AL,1,0,1);
1.24 +
1.25 +	//asm("fldmias r2!, {s1}");
1.26 +	VFP_FLDMIAS(CC_AL,2,1,1);
1.27 +
1.28 + //asm("fadds s2, s0, s1");
1.29 + VFP_FADDS(CC_AL,2,0,1);
1.30 +
1.31 + //asm("fstmias r0!, {s2}");
1.32 + VFP_FSTMIAS(CC_AL,0,2,1);
1.33 +
1.34 + asm("subs ip, ip, #1");
1.35 + asm("bne vfp_add_f32_loop1 ");
1.36 + asm("vfp_add_f32_unroll: movs ip, r3, lsr #3");
1.37 + asm("beq vfp_add_f32_end");
1.38 +
1.39 +
1.40 + //asm("fmrx lr, fpscr");
1.41 + VFP_FMRX(,14,VFP_XREG_FPSCR);
1.42 +
1.43 +
1.44 + asm("mov fp, #7");
1.45 + asm("orr fp, lr, fp, lsl #16");
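+	/* LEN field of FPSCR is bits [18:16]; LEN = 7 selects short vectors of 8 single-precision registers */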
1.46 +
1.47 + //asm("fmxr fpscr, fp");
1.48 + VFP_FMXR(,VFP_XREG_FPSCR,11);
1.49 +
1.50 +
1.51 + asm("vfp_add_f32_loop2:");
1.52 +
1.53 + //asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
1.54 + VFP_FLDMIAS(CC_AL,1,8,8);
1.55 +
1.56 + //asm("fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}");
1.57 + VFP_FLDMIAS(CC_AL,2,16,8);
1.58 +
1.59 + //asm("fadds s24, s8, s16");
1.60 +	VFP_FADDS(CC_AL,24,8,16);
+
+	//asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
+	VFP_FSTMIAS(CC_AL,0,24,8);
1.61 +
1.62 +	asm("subs ip, ip, #1");
1.63 +	asm("bne vfp_add_f32_loop2");
1.64 +
1.65 + //asm("fmxr fpscr, lr");
1.66 + VFP_FMXR(,VFP_XREG_FPSCR,14);
1.67 +
1.68 + asm("vfp_add_f32_end:");
1.69 + asm ("ldmia sp!, {fp, pc}");
1.70 +
1.71 + }
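+
+/* For reference, vfp_add_f32 computes the equivalent of the plain C loop
+ *     for (i = 0; i < n; i++) d[i] = s1[i] + s2[i];
+ * the divide, multiply and subtract variants below differ only in the
+ * VFP arithmetic instruction used.
+ */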
1.72 +
1.73 +
1.74 +EXPORT_C __NAKED__ void vfp_divide_f32 (float *d, const float *s1, const float *s2, int n)
1.75 + {
1.76 + asm(" stmdb sp!, {fp, lr}");
1.77 + asm("ands ip, r3, #7");
1.78 + asm("beq vfp_divide_f32_unroll");
1.79 +
1.80 +	asm("vfp_divide_f32_loop1:");
1.81 +
1.82 +	//asm("fldmias r1!, {s0}");
1.83 +	VFP_FLDMIAS(CC_AL,1,0,1);
1.84 +
1.85 +	//asm("fldmias r2!, {s1}");
1.86 +	VFP_FLDMIAS(CC_AL,2,1,1);
1.87 +
1.88 +	//asm("fdivs s2, s0, s1");
1.89 + VFP_FDIVS(CC_AL,2,0,1);
1.90 +
1.91 + //asm("fstmias r0!, {s2}");
1.92 + VFP_FSTMIAS(CC_AL,0,2,1);
1.93 +
1.94 + asm("subs ip, ip, #1");
1.95 + asm("bne vfp_divide_f32_loop1");
1.96 + asm("vfp_divide_f32_unroll: movs ip, r3, lsr #3");
1.97 + asm("beq vfp_divide_f32_end");
1.98 +
1.99 +
1.100 + //asm("fmrx lr, fpscr");
1.101 + VFP_FMRX(,14,VFP_XREG_FPSCR);
1.102 +
1.103 +
1.104 + asm("mov fp, #7");
1.105 + asm("orr fp, lr, fp, lsl #16");
1.106 +
1.107 + //asm("fmxr fpscr, fp");
1.108 + VFP_FMXR(,VFP_XREG_FPSCR,11);
1.109 +
1.110 +
1.111 + asm("vfp_divide_f32_loop2:");
1.112 +
1.113 + //asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
1.114 + VFP_FLDMIAS(CC_AL,1,8,8);
1.115 +
1.116 + //asm("fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}");
1.117 + VFP_FLDMIAS(CC_AL,2,16,8);
1.118 +
1.119 +	//asm("fdivs s24, s8, s16");
1.120 +	VFP_FDIVS(CC_AL,24,8,16);
+
+	//asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
+	VFP_FSTMIAS(CC_AL,0,24,8);
1.121 +
1.122 +	asm("subs ip, ip, #1");
1.123 +	asm("bne vfp_divide_f32_loop2");
1.124 +
1.125 + //asm("fmxr fpscr, lr");
1.126 + VFP_FMXR(,VFP_XREG_FPSCR,14);
1.127 +
1.128 + asm("vfp_divide_f32_end:");
1.129 + asm ("ldmia sp!, {fp, pc}");
1.130 +
1.131 + }
1.132 +
1.133 +EXPORT_C __NAKED__ void vfp_multiply_f32 (float *d, const float *s1, const float *s2, int n)
1.134 + {
1.135 + asm(" stmdb sp!, {fp, lr}");
1.136 + asm("ands ip, r3, #7");
1.137 + asm("beq vfp_multiply_f32_unroll");
1.138 +
1.139 +	asm("vfp_multiply_f32_loop1:");
1.140 +
1.141 +	//asm("fldmias r1!, {s0}");
1.142 +	VFP_FLDMIAS(CC_AL,1,0,1);
1.143 +
1.144 +	//asm("fldmias r2!, {s1}");
1.145 +	VFP_FLDMIAS(CC_AL,2,1,1);
1.146 +
1.147 +	//asm("fmuls s2, s0, s1");
1.148 + VFP_FMULS(CC_AL,2,0,1);
1.149 +
1.150 + //asm("fstmias r0!, {s2}");
1.151 + VFP_FSTMIAS(CC_AL,0,2,1);
1.152 +
1.153 + asm("subs ip, ip, #1");
1.154 + asm("bne vfp_multiply_f32_loop1");
1.155 + asm("vfp_multiply_f32_unroll: movs ip, r3, lsr #3");
1.156 + asm("beq vfp_multiply_f32_end");
1.157 +
1.158 +
1.159 + //asm("fmrx lr, fpscr");
1.160 + VFP_FMRX(,14,VFP_XREG_FPSCR);
1.161 +
1.162 +
1.163 + asm("mov fp, #7");
1.164 + asm("orr fp, lr, fp, lsl #16");
1.165 +
1.166 + //asm("fmxr fpscr, fp");
1.167 + VFP_FMXR(,VFP_XREG_FPSCR,11);
1.168 +
1.169 +
1.170 + asm("vfp_multiply_f32_loop2:");
1.171 +
1.172 + //asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
1.173 + VFP_FLDMIAS(CC_AL,1,8,8);
1.174 +
1.175 + //asm("fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}");
1.176 + VFP_FLDMIAS(CC_AL,2,16,8);
1.177 +
1.178 +	//asm("fmuls s24, s8, s16");
1.179 +	VFP_FMULS(CC_AL,24,8,16);
+
+	//asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
+	VFP_FSTMIAS(CC_AL,0,24,8);
1.180 +
1.181 +	asm("subs ip, ip, #1");
1.182 +	asm("bne vfp_multiply_f32_loop2");
1.183 +
1.184 + //asm("fmxr fpscr, lr");
1.185 + VFP_FMXR(,VFP_XREG_FPSCR,14);
1.186 +
1.187 + asm("vfp_multiply_f32_end:");
1.188 + asm ("ldmia sp!, {fp, pc}");
1.189 +
1.190 + }
1.191 +
1.192 +EXPORT_C __NAKED__ void vfp_subtract_f32 (float *d, const float *s1, const float *s2, int n)
1.193 + {
1.194 + asm(" stmdb sp!, {fp, lr}");
1.195 + asm("ands ip, r3, #7");
1.196 + asm("beq vfp_subtract_f32_unroll");
1.197 +
1.198 +	asm("vfp_subtract_f32_loop1:");
1.199 +
1.200 +	//asm("fldmias r1!, {s0}");
1.201 +	VFP_FLDMIAS(CC_AL,1,0,1);
1.202 +
1.203 +	//asm("fldmias r2!, {s1}");
1.204 +	VFP_FLDMIAS(CC_AL,2,1,1);
1.205 +
1.206 +	//asm("fsubs s2, s0, s1");
1.207 + VFP_FSUBS(CC_AL,2,0,1);
1.208 +
1.209 + //asm("fstmias r0!, {s2}");
1.210 + VFP_FSTMIAS(CC_AL,0,2,1);
1.211 +
1.212 + asm("subs ip, ip, #1");
1.213 + asm("bne vfp_subtract_f32_loop1");
1.214 + asm("vfp_subtract_f32_unroll: movs ip, r3, lsr #3");
1.215 + asm("beq vfp_subtract_f32_end");
1.216 +
1.217 +
1.218 + //asm("fmrx lr, fpscr");
1.219 + VFP_FMRX(,14,VFP_XREG_FPSCR);
1.220 +
1.221 +
1.222 + asm("mov fp, #7");
1.223 + asm("orr fp, lr, fp, lsl #16");
1.224 +
1.225 + //asm("fmxr fpscr, fp");
1.226 + VFP_FMXR(,VFP_XREG_FPSCR,11);
1.227 +
1.228 +
1.229 + asm("vfp_subtract_f32_loop2:");
1.230 +
1.231 + //asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
1.232 + VFP_FLDMIAS(CC_AL,1,8,8);
1.233 +
1.234 + //asm("fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}");
1.235 + VFP_FLDMIAS(CC_AL,2,16,8);
1.236 +
1.237 +	//asm("fsubs s24, s8, s16");
1.238 +	VFP_FSUBS(CC_AL,24,8,16);
+
+	//asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
+	VFP_FSTMIAS(CC_AL,0,24,8);
1.239 +
1.240 +	asm("subs ip, ip, #1");
1.241 +	asm("bne vfp_subtract_f32_loop2");
1.242 +
1.243 + //asm("fmxr fpscr, lr");
1.244 + VFP_FMXR(,VFP_XREG_FPSCR,14);
1.245 +
1.246 + asm("vfp_subtract_f32_end:");
1.247 + asm ("ldmia sp!, {fp, pc}");
1.248 +
1.249 + }
1.250 +
1.251 +EXPORT_C __NAKED__ void vfp_add_f64 (double *d, const double *s1, const double *s2, int n)
1.252 +{
1.253 + asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
1.254 +	asm("ands ip, r3, #3");			/* ip = n % 4 */
1.255 + asm("beq vfp_add_f64_unroll"); /* if ip == 0 goto prep_loop2 */
1.256 + asm("vfp_add_f64_loop1:");
1.257 +
1.258 + //asm("fldmiad r1!, {d0}");
1.259 + VFP_FLDMIAD(CC_AL,1,0,1);
1.260 +
1.261 + //asm("fldmiad r2!, {d1}");
1.262 + VFP_FLDMIAD(CC_AL,2,1,1);
1.263 +
1.264 + //asm("faddd d2, d0, d1");
1.265 + VFP_FADDD(,2,0,1);
1.266 +
1.267 + //asm("fstmiad r0!, {d2}");
1.268 + VFP_FSTMIAD(CC_AL,0,2,1);
1.269 +
1.270 + asm("subs ip, ip, #1");
1.271 + asm("bne vfp_add_f64_loop1");
1.272 + asm("vfp_add_f64_unroll:"); /* unroll by 4 */
1.273 + asm("movs ip, r3, lsr #2"); /* ip = n / 4 */
1.274 + asm(" beq vfp_add_f64_end"); /* if ip == 0 goto finish */
1.275 +
1.276 + //asm(" fmrx lr, fpscr"); /* read fpscr register into arm */
1.277 + VFP_FMRX(,14,VFP_XREG_FPSCR);
1.278 +
1.279 + asm("mov fp, #3");
1.280 +	asm("orr fp, lr, fp, lsl #16");		/* set vector length to 4 */
1.281 +
1.282 + //asm("fmxr fpscr, fp");
1.283 + VFP_FMXR(,VFP_XREG_FPSCR,11);
1.284 +
1.285 + asm("vfp_add_f64_loop2:");
1.286 +
1.287 + //asm("fldmiad r1!, {d4, d5, d6, d7}");
1.288 +	VFP_FLDMIAD(CC_AL,1,4,4);
1.289 +
1.290 + //asm("fldmiad r2!, {d8, d9, d10, d11}");
1.291 +	VFP_FLDMIAD(CC_AL,2,8,4);
1.292 +
1.293 + //asm("faddd d12, d4, d8");
1.294 + VFP_FADDD(,12,4,8);
1.295 +
1.296 + //asm("fstmiad r0!, {d12, d13, d14, d15}");
1.297 +	VFP_FSTMIAD(CC_AL,0,12,4);
1.298 +
1.299 + asm("subs ip, ip, #1");
1.300 + asm("bne vfp_add_f64_loop2");
1.301 +
1.302 + //asm("fmxr fpscr, lr"); /* restore original fpscr */
1.303 + VFP_FMXR(,VFP_XREG_FPSCR,14);
1.304 +
1.305 + asm("vfp_add_f64_end:");
1.306 +	asm("ldmia sp!, {fp, pc}");		/* restore registers from stack and return */
1.307 +}
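+
+/* The double-precision variants program FPSCR LEN = 3 (short vectors of 4
+ * doubles): d4-d7, d8-d11 and d12-d15 are the vector banks, while d0-d3 is
+ * the scalar bank.
+ */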
1.308 +
1.309 +
1.310 +
1.311 +
1.312 +EXPORT_C __NAKED__ void vfp_abs_f32_f32_ns(float *d, const float *s, int n)
1.313 + {
1.314 + asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
1.315 + asm("ands ip, r2, #7"); /* ip = n % 8 */
1.316 + asm("beq vfp_abs_f32_f32_ns_unroll"); /* if ip == 0 goto prep_loop2 */
1.317 + asm("vfp_abs_f32_f32_ns_loop1:");
1.318 +
1.319 + //asm("fldmias r1!, {s0}");
1.320 + VFP_FLDMIAS(CC_AL,1,0,1);
1.321 +
1.322 + //asm("fabss s2, s0");
1.323 + VFP_FABSS(CC_AL,2,0);
1.324 +
1.325 + //asm("fstmias r0!, {s2}");
1.326 + VFP_FSTMIAS(CC_AL,0,2,1);
1.327 +
1.328 + asm("subs ip, ip, #1");
1.329 + asm("bne vfp_abs_f32_f32_ns_loop1");
1.330 + asm("vfp_abs_f32_f32_ns_unroll:"); /* unroll by 8 */
1.331 + asm("movs ip, r2, lsr #3"); /* ip = n / 8 */
1.332 + asm("beq vfp_abs_f32_f32_ns_end"); /* if ip == 0 goto finish */
1.333 +
1.334 + //asm("fmrx lr, fpscr"); /* read fpscr register into arm */
1.335 + VFP_FMRX(,14,VFP_XREG_FPSCR);
1.336 +
1.337 + asm("mov fp, #7");
1.338 +	asm("orr fp, lr, fp, lsl #16");		/* set vector length to 8 */
1.339 +
1.340 + //asm("fmxr fpscr, fp");
1.341 + VFP_FMXR(,VFP_XREG_FPSCR,11);
1.342 +
1.343 + asm("vfp_abs_f32_f32_ns_loop2:");
1.344 +
1.345 + //asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
1.346 + VFP_FLDMIAS(CC_AL,1,8,8);
1.347 +
1.348 + //asm("fabss s24, s8");
1.349 +	VFP_FABSS(CC_AL,24,8);
1.350 +
1.351 + //asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
1.352 + VFP_FSTMIAS(CC_AL,0,24,8);
1.353 +
1.354 + asm("subs ip, ip, #1");
1.355 + asm("bne vfp_abs_f32_f32_ns_loop2");
1.356 +
1.357 + //asm("fmxr fpscr, lr"); /* restore original fpscr */
1.358 + VFP_FMXR(,VFP_XREG_FPSCR,14);
1.359 +
1.360 + asm("vfp_abs_f32_f32_ns_end:");
1.361 +	asm("ldmia sp!, {fp, pc}");		/* restore registers from stack and return */
1.362 + }
1.363 +
1.364 +EXPORT_C __NAKED__ void vfp_negative_f32(float *d, const float *s, int n)
1.365 + {
1.366 + asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
1.367 + asm("ands ip, r2, #7"); /* ip = n % 8 */
1.368 + asm("beq vfp_negative_f32_unroll"); /* if ip == 0 goto prep_loop2 */
1.369 + asm("vfp_negative_f32_loop1:");
1.370 +
1.371 + //asm("fldmias r1!, {s0}");
1.372 + VFP_FLDMIAS(CC_AL,1,0,1);
1.373 +
1.374 + //asm("fnegs s2, s0");
1.375 + VFP_FNEGS(CC_AL,2,0);
1.376 +
1.377 + //asm("fstmias r0!, {s2}");
1.378 + VFP_FSTMIAS(CC_AL,0,2,1);
1.379 +
1.380 + asm("subs ip, ip, #1");
1.381 + asm("bne vfp_negative_f32_loop1");
1.382 + asm("vfp_negative_f32_unroll:"); /* unroll by 8 */
1.383 + asm("movs ip, r2, lsr #3"); /* ip = n / 8 */
1.384 + asm("beq vfp_negative_f32_end"); /* if ip == 0 goto finish */
1.385 +
1.386 + //asm("fmrx lr, fpscr"); /* read fpscr register into arm */
1.387 + VFP_FMRX(,14,VFP_XREG_FPSCR);
1.388 +
1.389 + asm("mov fp, #7");
1.390 +	asm("orr fp, lr, fp, lsl #16");		/* set vector length to 8 */
1.391 +
1.392 + // asm("fmxr fpscr, fp");
1.393 + VFP_FMXR(,VFP_XREG_FPSCR,11);
1.394 +
1.395 + asm("vfp_negative_f32_loop2:");
1.396 +
1.397 + //asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
1.398 + VFP_FLDMIAS(CC_AL,1,8,8);
1.399 +
1.400 + //asm("fnegs s24, s8");
1.401 +	VFP_FNEGS(CC_AL,24,8);
1.402 +
1.403 + //asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
1.404 + VFP_FSTMIAS(CC_AL,0,24,8);
1.405 +
1.406 + asm("subs ip, ip, #1");
1.407 + asm("bne vfp_negative_f32_loop2");
1.408 +
1.409 + //asm("fmxr fpscr, lr"); /* restore original fpscr */
1.410 + VFP_FMXR(,VFP_XREG_FPSCR,14);
1.411 +
1.412 + asm("vfp_negative_f32_end:");
1.413 +	asm("ldmia sp!, {fp, pc}");		/* restore registers from stack and return */
1.414 + }
1.415 +
1.416 +EXPORT_C __NAKED__ void vfp_abs_f64_f64_ns(double *d, const double *s, int n)
1.417 + {
1.418 + asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
1.419 +	asm("ands ip, r2, #3");			/* ip = n % 4 */
1.420 + asm("beq vfp_abs_f64_f64_ns_unroll"); /* if ip == 0 goto prep_loop2 */
1.421 + asm("vfp_abs_f64_f64_ns_loop1:");
1.422 +
1.423 + //asm("fldmiad r1!, {d0}");
1.424 + VFP_FLDMIAD(CC_AL,1,0,1);
1.425 +
1.426 + //asm("fabsd d2, d0");
1.427 + VFP_FABSD(,2,0);
1.428 +
1.429 + //asm("fstmiad r0!, {d2}");
1.430 + VFP_FSTMIAD(CC_AL,0,2,1);
1.431 +
1.432 + asm("subs ip, ip, #1");
1.433 + asm("bne vfp_abs_f64_f64_ns_loop1");
1.434 + asm("vfp_abs_f64_f64_ns_unroll:"); /* unroll by 4 */
1.435 + asm("movs ip, r2, lsr #2"); /* ip = n / 4 */
1.436 + asm("beq vfp_abs_f64_f64_ns_end"); /* if ip == 0 goto finish */
1.437 +
1.438 + //asm("fmrx lr, fpscr"); /* read fpscr register into arm */
1.439 + VFP_FMRX(,14,VFP_XREG_FPSCR);
1.440 +
1.441 + asm("mov fp, #3");
1.442 +	asm("orr fp, lr, fp, lsl #16");		/* set vector length to 4 */
1.443 +
1.444 + //asm("fmxr fpscr, fp");
1.445 + VFP_FMXR(,VFP_XREG_FPSCR,11);
1.446 +
1.447 + asm("vfp_abs_f64_f64_ns_loop2:");
1.448 +
1.449 +
1.450 + //asm("fldmiad r1!, {d4, d5, d6, d7}");
1.451 + VFP_FLDMIAD(CC_AL,1,4,4);
1.452 +
1.453 + //asm("fabsd d12, d4");
1.454 + VFP_FABSD(,12,4);
1.455 +
1.456 + //asm("fstmiad r0!, {d12, d13, d14, d15}");
1.457 + VFP_FSTMIAD(CC_AL,0,12,4);
1.458 +
1.459 + asm("subs ip, ip, #1");
1.460 + asm("bne vfp_abs_f64_f64_ns_loop2");
1.461 +
1.462 + // asm("fmxr fpscr, lr"); /* restore original fpscr */
1.463 + VFP_FMXR(,VFP_XREG_FPSCR,14);
1.464 +
1.465 + asm("vfp_abs_f64_f64_ns_end:");
1.466 +	asm("ldmia sp!, {fp, pc}");		/* restore registers from stack and return */
1.467 + }
1.468 +
1.469 +
1.470 +EXPORT_C __NAKED__ void vfp_negative_f64(double *d, const double *s, int n)
1.471 + {
1.472 + asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
1.473 +	asm("ands ip, r2, #3");			/* ip = n % 4 */
1.474 + asm("beq vfp_negative_f64_unroll"); /* if ip == 0 goto prep_loop2 */
1.475 + asm("vfp_negative_f64_loop1:");
1.476 +
1.477 + //asm("fldmiad r1!, {d0}");
1.478 + VFP_FLDMIAD(CC_AL,1,0,1);
1.479 +
1.480 + //asm("fnegd d2, d0");
1.481 + VFP_FNEGD(,2,0);
1.482 +
1.483 + //asm("fstmiad r0!, {d2}");
1.484 + VFP_FSTMIAD(CC_AL,0,2,1);
1.485 +
1.486 + asm("subs ip, ip, #1");
1.487 + asm("bne vfp_negative_f64_loop1");
1.488 + asm("vfp_negative_f64_unroll:"); /* unroll by 4 */
1.489 + asm("movs ip, r2, lsr #2"); /* ip = n / 4 */
1.490 + asm("beq vfp_negative_f64_end"); /* if ip == 0 goto finish */
1.491 +
1.492 + //asm("fmrx lr, fpscr"); /* read fpscr register into arm */
1.493 + VFP_FMRX(,14,VFP_XREG_FPSCR);
1.494 +
1.495 + asm("mov fp, #3");
1.496 +	asm("orr fp, lr, fp, lsl #16");		/* set vector length to 4 */
1.497 +
1.498 + //asm("fmxr fpscr, fp");
1.499 + VFP_FMXR(,VFP_XREG_FPSCR,11);
1.500 +
1.501 + asm("vfp_negative_f64_loop2:");
1.502 +
1.503 + //asm("fldmiad r1!, {d4, d5, d6, d7}");
1.504 + VFP_FLDMIAD(CC_AL,1,4,4);
1.505 +
1.506 + //asm("fnegd d12, d4");
1.507 + VFP_FNEGD(,12,4);
1.508 +
1.509 + //asm("fstmiad r0!, {d12, d13, d14, d15}");
1.510 + VFP_FSTMIAD(CC_AL,0,12,4);
1.511 +
1.512 + asm("subs ip, ip, #1");
1.513 + asm("bne vfp_negative_f64_loop2");
1.514 +
1.515 + //asm("fmxr fpscr, lr"); /* restore original fpscr */
1.516 + VFP_FMXR(,VFP_XREG_FPSCR,14);
1.517 +
1.518 + asm("vfp_negative_f64_end:");
1.519 +	asm("ldmia sp!, {fp, pc}");		/* restore registers from stack and return */
1.520 + }
1.521 +
1.522 +
1.523 +//Rakhi changes
1.524 +EXPORT_C __NAKED__ void vfp_divide_f64 (double *d, const double *s1, const double *s2, int n)
1.525 +{
1.526 + asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
1.527 +	asm("ands ip, r3, #3");			/* ip = n % 4 */
1.528 + asm("beq vfp_divide_f64_unroll"); /* if ip == 0 goto prep_loop2 */
1.529 + asm("vfp_divide_f64_loop1:");
1.530 +
1.531 + //asm("fldmiad r1!, {d0}");
1.532 + VFP_FLDMIAD(CC_AL,1,0,1);
1.533 +
1.534 + //asm("fldmiad r2!, {d1}");
1.535 + VFP_FLDMIAD(CC_AL,2,1,1);
1.536 +
1.537 +	//asm("fdivd d2, d0, d1");
1.538 + VFP_FDIVD(,2,0,1);
1.539 +
1.540 + //asm("fstmiad r0!, {d2}");
1.541 + VFP_FSTMIAD(CC_AL,0,2,1);
1.542 +
1.543 + asm("subs ip, ip, #1");
1.544 + asm("bne vfp_divide_f64_loop1");
1.545 + asm("vfp_divide_f64_unroll:"); /* unroll by 4 */
1.546 + asm("movs ip, r3, lsr #2"); /* ip = n / 4 */
1.547 + asm(" beq vfp_divide_f64_end"); /* if ip == 0 goto finish */
1.548 +
1.549 + //asm(" fmrx lr, fpscr"); /* read fpscr register into arm */
1.550 + VFP_FMRX(,14,VFP_XREG_FPSCR);
1.551 +
1.552 + asm("mov fp, #3");
1.553 +	asm("orr fp, lr, fp, lsl #16");		/* set vector length to 4 */
1.554 +
1.555 + //asm("fmxr fpscr, fp");
1.556 + VFP_FMXR(,VFP_XREG_FPSCR,11);
1.557 +
1.558 + asm("vfp_divide_f64_loop2:");
1.559 +
1.560 + //asm("fldmiad r1!, {d4, d5, d6, d7}");
1.561 +	VFP_FLDMIAD(CC_AL,1,4,4);
1.562 +
1.563 + //asm("fldmiad r2!, {d8, d9, d10, d11}");
1.564 +	VFP_FLDMIAD(CC_AL,2,8,4);
1.565 +
1.566 +	//asm("fdivd d12, d4, d8");
1.567 + VFP_FDIVD(,12,4,8);
1.568 +
1.569 + //asm("fstmiad r0!, {d12, d13, d14, d15}");
1.570 +	VFP_FSTMIAD(CC_AL,0,12,4);
1.571 +
1.572 + asm("subs ip, ip, #1");
1.573 + asm("bne vfp_divide_f64_loop2");
1.574 +
1.575 + //asm("fmxr fpscr, lr"); /* restore original fpscr */
1.576 + VFP_FMXR(,VFP_XREG_FPSCR,14);
1.577 +
1.578 + asm("vfp_divide_f64_end:");
1.579 +	asm("ldmia sp!, {fp, pc}");		/* restore registers from stack and return */
1.580 +}
1.581 +
1.582 +EXPORT_C __NAKED__ void vfp_multiply_f64 (double *d, const double *s1, const double *s2, int n)
1.583 +{
1.584 + asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
1.585 +	asm("ands ip, r3, #3");			/* ip = n % 4 */
1.586 + asm("beq vfp_multiply_f64_unroll"); /* if ip == 0 goto prep_loop2 */
1.587 + asm("vfp_multiply_f64_loop1:");
1.588 +
1.589 + //asm("fldmiad r1!, {d0}");
1.590 + VFP_FLDMIAD(CC_AL,1,0,1);
1.591 +
1.592 + //asm("fldmiad r2!, {d1}");
1.593 + VFP_FLDMIAD(CC_AL,2,1,1);
1.594 +
1.595 +	//asm("fmuld d2, d0, d1");
1.596 + VFP_FMULD(,2,0,1);
1.597 +
1.598 + //asm("fstmiad r0!, {d2}");
1.599 + VFP_FSTMIAD(CC_AL,0,2,1);
1.600 +
1.601 + asm("subs ip, ip, #1");
1.602 + asm("bne vfp_multiply_f64_loop1");
1.603 + asm("vfp_multiply_f64_unroll:"); /* unroll by 4 */
1.604 + asm("movs ip, r3, lsr #2"); /* ip = n / 4 */
1.605 + asm(" beq vfp_multiply_f64_end"); /* if ip == 0 goto finish */
1.606 +
1.607 + //asm(" fmrx lr, fpscr"); /* read fpscr register into arm */
1.608 + VFP_FMRX(,14,VFP_XREG_FPSCR);
1.609 +
1.610 + asm("mov fp, #3");
1.611 +	asm("orr fp, lr, fp, lsl #16");		/* set vector length to 4 */
1.612 +
1.613 + //asm("fmxr fpscr, fp");
1.614 + VFP_FMXR(,VFP_XREG_FPSCR,11);
1.615 +
1.616 + asm("vfp_multiply_f64_loop2:");
1.617 +
1.618 + //asm("fldmiad r1!, {d4, d5, d6, d7}");
1.619 +	VFP_FLDMIAD(CC_AL,1,4,4);
1.620 +
1.621 + //asm("fldmiad r2!, {d8, d9, d10, d11}");
1.622 +	VFP_FLDMIAD(CC_AL,2,8,4);
1.623 +
1.624 +	//asm("fmuld d12, d4, d8");
1.625 + VFP_FMULD(,12,4,8);
1.626 +
1.627 + //asm("fstmiad r0!, {d12, d13, d14, d15}");
1.628 +	VFP_FSTMIAD(CC_AL,0,12,4);
1.629 +
1.630 + asm("subs ip, ip, #1");
1.631 + asm("bne vfp_multiply_f64_loop2");
1.632 +
1.633 + //asm("fmxr fpscr, lr"); /* restore original fpscr */
1.634 + VFP_FMXR(,VFP_XREG_FPSCR,14);
1.635 +
1.636 + asm("vfp_multiply_f64_end:");
1.637 +	asm("ldmia sp!, {fp, pc}");		/* restore registers from stack and return */
1.638 +}
1.639 +
1.640 +EXPORT_C __NAKED__ void vfp_subtract_f64 (double *d, const double *s1, const double *s2, int n)
1.641 +{
1.642 + asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
1.643 +	asm("ands ip, r3, #3");			/* ip = n % 4 */
1.644 + asm("beq vfp_subtract_f64_unroll"); /* if ip == 0 goto prep_loop2 */
1.645 + asm("vfp_subtract_f64_loop1:");
1.646 +
1.647 + //asm("fldmiad r1!, {d0}");
1.648 + VFP_FLDMIAD(CC_AL,1,0,1);
1.649 +
1.650 + //asm("fldmiad r2!, {d1}");
1.651 + VFP_FLDMIAD(CC_AL,2,1,1);
1.652 +
1.653 +	//asm("fsubd d2, d0, d1");
1.654 + VFP_FSUBD(,2,0,1);
1.655 +
1.656 + //asm("fstmiad r0!, {d2}");
1.657 + VFP_FSTMIAD(CC_AL,0,2,1);
1.658 +
1.659 + asm("subs ip, ip, #1");
1.660 + asm("bne vfp_subtract_f64_loop1");
1.661 + asm("vfp_subtract_f64_unroll:"); /* unroll by 4 */
1.662 + asm("movs ip, r3, lsr #2"); /* ip = n / 4 */
1.663 + asm(" beq vfp_subtract_f64_end"); /* if ip == 0 goto finish */
1.664 +
1.665 + //asm(" fmrx lr, fpscr"); /* read fpscr register into arm */
1.666 + VFP_FMRX(,14,VFP_XREG_FPSCR);
1.667 +
1.668 + asm("mov fp, #3");
1.669 +	asm("orr fp, lr, fp, lsl #16");		/* set vector length to 4 */
1.670 +
1.671 + //asm("fmxr fpscr, fp");
1.672 + VFP_FMXR(,VFP_XREG_FPSCR,11);
1.673 +
1.674 + asm("vfp_subtract_f64_loop2:");
1.675 +
1.676 + //asm("fldmiad r1!, {d4, d5, d6, d7}");
1.677 +	VFP_FLDMIAD(CC_AL,1,4,4);
1.678 +
1.679 + //asm("fldmiad r2!, {d8, d9, d10, d11}");
1.680 +	VFP_FLDMIAD(CC_AL,2,8,4);
1.681 +
1.682 +	//asm("fsubd d12, d4, d8");
1.683 + VFP_FSUBD(,12,4,8);
1.684 +
1.685 + //asm("fstmiad r0!, {d12, d13, d14, d15}");
1.686 +	VFP_FSTMIAD(CC_AL,0,12,4);
1.687 +
1.688 + asm("subs ip, ip, #1");
1.689 + asm("bne vfp_subtract_f64_loop2");
1.690 +
1.691 + //asm("fmxr fpscr, lr"); /* restore original fpscr */
1.692 + VFP_FMXR(,VFP_XREG_FPSCR,14);
1.693 +
1.694 + asm("vfp_subtract_f64_end:");
1.695 +	asm("ldmia sp!, {fp, pc}");		/* restore registers from stack and return */
1.696 +}
1.697 +
1.698 +EXPORT_C __NAKED__ void vfp_scalaradd_f32_ns (float *d, const float *s1, const float *s2_1, int n)
1.699 +{
1.700 + asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
1.701 +
1.702 + //asm("fldmias r2, {s1}"); /* load scalar value */
1.703 + VFP_FLDMIAS(CC_AL,2,1,1);
1.704 +
1.705 + asm("ands ip, r3, #7"); /* ip = n % 8 */
1.706 + asm("beq vfp_scalaradd_f32_ns_unroll"); /* if ip == 0 goto prep_loop2 */
1.707 + asm("vfp_scalaradd_f32_ns_loop1:");
1.708 +
1.709 + //asm("fldmias r1!, {s0}");
1.710 + VFP_FLDMIAS(CC_AL,1,0,1);
1.711 +
1.712 + //asm("FADDS s2, s0, s1");
1.713 + VFP_FADDS(CC_AL,2,0,1);
1.714 +
1.715 + //asm("fstmias r0!, {s2}");
1.716 +	VFP_FSTMIAS(CC_AL,0,2,1);
1.717 +
1.718 + asm("subs ip, ip, #1");
1.719 + asm("bne vfp_scalaradd_f32_ns_loop1");
1.720 + asm("vfp_scalaradd_f32_ns_unroll:"); /* unroll by 8 */
1.721 + asm("movs ip, r3, lsr #3"); /* ip = n / 8 */
1.722 + asm("beq vfp_scalaradd_f32_ns_end"); /* if ip == 0 goto finish */
1.723 +
1.724 +	//asm("fmrx lr, fpscr");		/* read fpscr register into arm */
1.725 + VFP_FMRX(,14,VFP_XREG_FPSCR);
1.726 +
1.727 + asm("mov fp, #7");
1.728 +	asm("orr fp, lr, fp, lsl #16");		/* set vector length to 8 */
1.729 +
1.730 + //asm("fmxr fpscr, fp");
1.731 + VFP_FMXR(,VFP_XREG_FPSCR,11);
1.732 +
1.733 + asm("vfp_scalaradd_f32_ns_loop2:");
1.734 + //asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
1.735 + VFP_FLDMIAS(CC_AL,1,8,8);
1.736 +
1.737 + //asm("FADDS s24, s8, s1");
1.738 + VFP_FADDS(CC_AL,24,8,1);
1.739 +
1.740 + //asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
1.741 + VFP_FSTMIAS(CC_AL,0,24,8);
1.742 +
1.743 + asm("subs ip, ip, #1");
1.744 + asm("bne vfp_scalaradd_f32_ns_loop2");
1.745 +
1.746 + //asm("fmxr fpscr, lr"); /* restore original fpscr */
1.747 + VFP_FMXR(,VFP_XREG_FPSCR,14);
1.748 +
1.749 + asm("vfp_scalaradd_f32_ns_end:");
1.750 +	asm("ldmia sp!, {fp, pc}");		/* restore registers from stack and return */
1.751 +}
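+
+/* In the vectorised loop above the scalar operand stays in s1: with a non-zero
+ * FPSCR LEN, a second source operand taken from bank 0 (s0-s7) is treated as a
+ * scalar, so fadds s24, s8, s1 adds the same value to all eight elements.
+ * Equivalent C: for (i = 0; i < n; i++) d[i] = s1[i] + s2_1[0];
+ */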
1.752 +
1.753 +EXPORT_C __NAKED__ void vfp_scalarmultiply_f32_ns (float *d, const float *s1, const float *s2_1, int n)
1.754 +{
1.755 + asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
1.756 +
1.757 + //asm("fldmias r2, {s1}"); /* load scalar value */
1.758 + VFP_FLDMIAS(CC_AL,2,1,1);
1.759 +
1.760 + asm("ands ip, r3, #7"); /* ip = n % 8 */
1.761 + asm("beq vfp_scalarmultiply_f32_ns_unroll"); /* if ip == 0 goto prep_loop2 */
1.762 + asm("vfp_scalarmultiply_f32_ns_loop1:");
1.763 +
1.764 + //asm("fldmias r1!, {s0}");
1.765 + VFP_FLDMIAS(CC_AL,1,0,1);
1.766 +
1.767 +	//asm("fmuls s2, s0, s1");
1.768 + VFP_FMULS(CC_AL,2,0,1);
1.769 +
1.770 + //asm("fstmias r0!, {s2}");
1.771 +	VFP_FSTMIAS(CC_AL,0,2,1);
1.772 +
1.773 + asm("subs ip, ip, #1");
1.774 + asm("bne vfp_scalarmultiply_f32_ns_loop1");
1.775 + asm("vfp_scalarmultiply_f32_ns_unroll:"); /* unroll by 8 */
1.776 + asm("movs ip, r3, lsr #3"); /* ip = n / 8 */
1.777 + asm("beq vfp_scalarmultiply_f32_ns_end"); /* if ip == 0 goto finish */
1.778 +
1.779 +	//asm("fmrx lr, fpscr");		/* read fpscr register into arm */
1.780 + VFP_FMRX(,14,VFP_XREG_FPSCR);
1.781 +
1.782 + asm("mov fp, #7");
1.783 +	asm("orr fp, lr, fp, lsl #16");		/* set vector length to 8 */
1.784 +
1.785 + //asm("fmxr fpscr, fp");
1.786 + VFP_FMXR(,VFP_XREG_FPSCR,11);
1.787 +
1.788 + asm("vfp_scalarmultiply_f32_ns_loop2:");
1.789 + //asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
1.790 + VFP_FLDMIAS(CC_AL,1,8,8);
1.791 +
1.792 +	//asm("fmuls s24, s8, s1");
1.793 + VFP_FMULS(CC_AL,24,8,1);
1.794 +
1.795 + //asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
1.796 + VFP_FSTMIAS(CC_AL,0,24,8);
1.797 +
1.798 + asm("subs ip, ip, #1");
1.799 + asm("bne vfp_scalarmultiply_f32_ns_loop2");
1.800 +
1.801 + //asm("fmxr fpscr, lr"); /* restore original fpscr */
1.802 + VFP_FMXR(,VFP_XREG_FPSCR,14);
1.803 +
1.804 + asm("vfp_scalarmultiply_f32_ns_end:");
1.805 +	asm("ldmia sp!, {fp, pc}");		/* restore registers from stack and return */
1.806 +}
1.807 +
1.808 +EXPORT_C __NAKED__ void vfp_scalaradd_f64_ns (double *d, const double *s1, const double *s2_1, int n)
1.809 +{
1.810 + asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
1.811 +
1.812 + //asm("fldmiad r2, {d1}"); /* load scalar value */
1.813 + VFP_FLDMIAD(CC_AL,2,1,1);
1.814 +
1.815 +	asm("ands ip, r3, #3");			/* ip = n % 4 */
1.816 + asm("beq vfp_scalaradd_f64_ns_unroll"); /* if ip == 0 goto prep_loop2 */
1.817 + asm("vfp_scalaradd_f64_ns_loop1:");
1.818 + //asm("fldmiad r1!, {d0}");
1.819 + VFP_FLDMIAD(CC_AL,1,0,1);
1.820 +
1.821 +	//asm("faddd d2, d0, d1");
1.822 + VFP_FADDD(,2,0,1);
1.823 +
1.824 + //asm("fstmiad r0!, {d2}");
1.825 + VFP_FSTMIAD(CC_AL,0,2,1);
1.826 +
1.827 + asm("subs ip, ip, #1");
1.828 + asm("bne vfp_scalaradd_f64_ns_loop1");
1.829 + asm("vfp_scalaradd_f64_ns_unroll:"); /* unroll by 4 */
1.830 + asm("movs ip, r3, lsr #2"); /* ip = n / 4 */
1.831 + asm("beq vfp_scalaradd_f64_ns_end"); /* if ip == 0 goto finish */
1.832 +
1.833 +	//asm("fmrx lr, fpscr");		/* read fpscr register into arm */
1.834 + VFP_FMRX(,14,VFP_XREG_FPSCR);
1.835 +
1.836 + asm("mov fp, #3");
1.837 +	asm("orr fp, lr, fp, lsl #16");		/* set vector length to 4 */
1.838 +
1.839 + //asm("fmxr fpscr, fp");
1.840 + VFP_FMXR(,VFP_XREG_FPSCR,11);
1.841 +
1.842 + asm("vfp_scalaradd_f64_ns_loop2:");
1.843 +
1.844 + //asm("fldmiad r1!, {d4, d5, d6, d7}");
1.845 + VFP_FLDMIAD(CC_AL,1,4,4);
1.846 +
1.847 +	//asm("faddd d12, d4, d1");
1.848 + VFP_FADDD(,12,4,1);
1.849 +
1.850 + //asm("fstmiad r0!, {d12, d13, d14, d15}");
1.851 + VFP_FSTMIAD(CC_AL,0,12,4);
1.852 +
1.853 + asm("subs ip, ip, #1");
1.854 + asm("bne vfp_scalaradd_f64_ns_loop2");
1.855 +
1.856 + //asm("fmxr fpscr, lr"); /* restore original fpscr */
1.857 + VFP_FMXR(,VFP_XREG_FPSCR,14);
1.858 +
1.859 + asm("vfp_scalaradd_f64_ns_end:");
1.860 +	asm("ldmia sp!, {fp, pc}");		/* restore registers from stack and return */
1.861 +}
1.862 +
1.863 +EXPORT_C __NAKED__ void vfp_scalarmultiply_f64_ns (double *d, const double *s1, const double *s2_1, int n)
1.864 +{
1.865 +
1.866 + asm("stmdb sp!, {fp, lr}"); /* save registers to stack */
1.867 +
1.868 + //asm("fldmiad r2, {d1}"); /* load scalar value */
1.869 + VFP_FLDMIAD(CC_AL,2,1,1);
1.870 +
1.871 +	asm("ands ip, r3, #3");			/* ip = n % 4 */
1.872 + asm("beq vfp_scalarmultiply_f64_ns_unroll"); /* if ip == 0 goto prep_loop2 */
1.873 + asm("vfp_scalarmultiply_f64_ns_loop1:");
1.874 + //asm("fldmiad r1!, {d0}");
1.875 + VFP_FLDMIAD(CC_AL,1,0,1);
1.876 +
1.877 +	//asm("fmuld d2, d0, d1");
1.878 + VFP_FMULD(,2,0,1);
1.879 +
1.880 + //asm("fstmiad r0!, {d2}");
1.881 + VFP_FSTMIAD(CC_AL,0,2,1);
1.882 +
1.883 + asm("subs ip, ip, #1");
1.884 + asm("bne vfp_scalarmultiply_f64_ns_loop1");
1.885 + asm("vfp_scalarmultiply_f64_ns_unroll:"); /* unroll by 4 */
1.886 + asm("movs ip, r3, lsr #2"); /* ip = n / 4 */
1.887 + asm("beq vfp_scalarmultiply_f64_ns_end"); /* if ip == 0 goto finish */
1.888 +
1.889 +	//asm("fmrx lr, fpscr");		/* read fpscr register into arm */
1.890 + VFP_FMRX(,14,VFP_XREG_FPSCR);
1.891 +
1.892 + asm("mov fp, #3");
1.893 +	asm("orr fp, lr, fp, lsl #16");		/* set vector length to 4 */
1.894 +
1.895 + //asm("fmxr fpscr, fp");
1.896 + VFP_FMXR(,VFP_XREG_FPSCR,11);
1.897 +
1.898 + asm("vfp_scalarmultiply_f64_ns_loop2:");
1.899 +
1.900 + //asm("fldmiad r1!, {d4, d5, d6, d7}");
1.901 + VFP_FLDMIAD(CC_AL,1,4,4);
1.902 +
1.903 +	//asm("fmuld d12, d4, d1");
1.904 + VFP_FMULD(,12,4,1);
1.905 +
1.906 + //asm("fstmiad r0!, {d12, d13, d14, d15}");
1.907 + VFP_FSTMIAD(CC_AL,0,12,4);
1.908 +
1.909 + asm("subs ip, ip, #1");
1.910 + asm("bne vfp_scalarmultiply_f64_ns_loop2");
1.911 +
1.912 + //asm("fmxr fpscr, lr"); /* restore original fpscr */
1.913 + VFP_FMXR(,VFP_XREG_FPSCR,14);
1.914 +
1.915 + asm("vfp_scalarmultiply_f64_ns_end:");
1.916 +	asm("ldmia sp!, {fp, pc}");		/* restore registers from stack and return */
1.917 +
1.918 +}
1.919 +
1.920 +
1.921 +}
1.922 +#endif