/*
 * os/ossrv/genericopenlibs/liboil/src/arm/math_vfp_asm.cia
 * ARM VFP (short-vector mode) implementations of liboil math kernels.
 * Imported from Symbian contrib (changeset 1, 260cb5ec6c19).
 */
sl@0
     1
#if __ARMCC__
sl@0
     2
sl@0
     3
#define __CPU_ARM 
sl@0
     4
#define __CPU_HAS_VFP
sl@0
     5
#include <arm_vfp.h>
sl@0
     6
#include <e32std.h>
sl@0
     7
sl@0
     8
sl@0
     9
extern "C" {
sl@0
    10
sl@0
    11
EXPORT_C __NAKED__ void vfp_add_f32 (float *d, const float *s1, const float *s2, int n)
sl@0
    12
    {
sl@0
    13
    asm(" stmdb sp!, {fp, lr}"); 
sl@0
    14
    asm("ands ip, r3, #7"); 
sl@0
    15
    asm("beq vfp_add_f32_unroll");
sl@0
    16
      
sl@0
    17
   //asm("fldmias r1!, {s0}"); 
sl@0
    18
   VFP_FLDMIAS(CC_AL,1,0,1);
sl@0
    19
   
sl@0
    20
   asm("vfp_add_f32_loop1: ");
sl@0
    21
      
sl@0
    22
   //asm("fldmias r2!, {s1}");    
sl@0
    23
     VFP_FLDMIAS(CC_AL,2,1,1);
sl@0
    24
 
sl@0
    25
    //asm("fadds s2, s0, s1");
sl@0
    26
    VFP_FADDS(CC_AL,2,0,1);
sl@0
    27
      
sl@0
    28
    //asm("fstmias r0!, {s2}");
sl@0
    29
    VFP_FSTMIAS(CC_AL,0,2,1);   
sl@0
    30
    
sl@0
    31
    asm("subs ip, ip, #1"); 
sl@0
    32
    asm("bne vfp_add_f32_loop1 ");
sl@0
    33
	asm("vfp_add_f32_unroll: movs ip, r3, lsr #3"); 
sl@0
    34
    asm("beq vfp_add_f32_end");
sl@0
    35
    
sl@0
    36
    
sl@0
    37
    //asm("fmrx lr, fpscr");  
sl@0
    38
    VFP_FMRX(,14,VFP_XREG_FPSCR);
sl@0
    39
    
sl@0
    40
    
sl@0
    41
    asm("mov fp, #7"); 
sl@0
    42
    asm("orr fp, lr, fp, lsl #16"); 
sl@0
    43
    
sl@0
    44
    //asm("fmxr fpscr, fp"); 
sl@0
    45
    VFP_FMXR(,VFP_XREG_FPSCR,11);
sl@0
    46
        
sl@0
    47
      
sl@0
    48
    asm("vfp_add_f32_loop2:");
sl@0
    49
  
sl@0
    50
    //asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}"); 
sl@0
    51
        VFP_FLDMIAS(CC_AL,1,8,8);   
sl@0
    52
 
sl@0
    53
    //asm("fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}");
sl@0
    54
    VFP_FLDMIAS(CC_AL,2,16,8);
sl@0
    55
   
sl@0
    56
    //asm("fadds s24, s8, s16"); 
sl@0
    57
        VFP_FADDS(CC_AL,24,8,16);      
sl@0
    58
   
sl@0
    59
    asm("subs ip, ip, #1"); 
sl@0
    60
    asm("bne vfp_add_f32_loop2"); 
sl@0
    61
  
sl@0
    62
    //asm("fmxr fpscr, lr"); 
sl@0
    63
    VFP_FMXR(,VFP_XREG_FPSCR,14);
sl@0
    64
      
sl@0
    65
   asm("vfp_add_f32_end:");
sl@0
    66
   asm ("ldmia sp!, {fp, pc}");
sl@0
    67
    
sl@0
    68
    }
sl@0
    69
sl@0
    70
sl@0
    71
EXPORT_C __NAKED__ void vfp_divide_f32 (float *d, const float *s1, const float *s2, int n)
sl@0
    72
    {
sl@0
    73
    asm(" stmdb sp!, {fp, lr}"); 
sl@0
    74
    asm("ands ip, r3, #7"); 
sl@0
    75
    asm("beq vfp_divide_f32_unroll");
sl@0
    76
      
sl@0
    77
   //asm("fldmias r1!, {s0}"); 
sl@0
    78
   VFP_FLDMIAS(CC_AL,1,0,1);
sl@0
    79
   
sl@0
    80
   asm("vfp_divide_f32_loop1:");
sl@0
    81
      
sl@0
    82
   //asm("fldmias r2!, {s1}");    
sl@0
    83
     VFP_FLDMIAS(CC_AL,2,1,1);
sl@0
    84
 
sl@0
    85
    //asm("fadds s2, s0, s1");
sl@0
    86
    VFP_FDIVS(CC_AL,2,0,1);
sl@0
    87
      
sl@0
    88
    //asm("fstmias r0!, {s2}");
sl@0
    89
    VFP_FSTMIAS(CC_AL,0,2,1);   
sl@0
    90
    
sl@0
    91
    asm("subs ip, ip, #1"); 
sl@0
    92
    asm("bne vfp_divide_f32_loop1");
sl@0
    93
    asm("vfp_divide_f32_unroll: movs ip, r3, lsr #3"); 
sl@0
    94
    asm("beq vfp_divide_f32_end");
sl@0
    95
    
sl@0
    96
    
sl@0
    97
    //asm("fmrx lr, fpscr");  
sl@0
    98
    VFP_FMRX(,14,VFP_XREG_FPSCR);
sl@0
    99
    
sl@0
   100
    
sl@0
   101
    asm("mov fp, #7"); 
sl@0
   102
    asm("orr fp, lr, fp, lsl #16"); 
sl@0
   103
    
sl@0
   104
    //asm("fmxr fpscr, fp"); 
sl@0
   105
    VFP_FMXR(,VFP_XREG_FPSCR,11);
sl@0
   106
        
sl@0
   107
      
sl@0
   108
    asm("vfp_divide_f32_loop2:");
sl@0
   109
  
sl@0
   110
    //asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}"); 
sl@0
   111
        VFP_FLDMIAS(CC_AL,1,8,8);   
sl@0
   112
 
sl@0
   113
    //asm("fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}");
sl@0
   114
    VFP_FLDMIAS(CC_AL,2,16,8);
sl@0
   115
   
sl@0
   116
    //asm("fadds s24, s8, s16"); 
sl@0
   117
        VFP_FDIVS(CC_AL,24,8,16);      
sl@0
   118
   
sl@0
   119
    asm("subs ip, ip, #1"); 
sl@0
   120
    asm("bne vfp_divide_f32_loop2"); 
sl@0
   121
  
sl@0
   122
    //asm("fmxr fpscr, lr"); 
sl@0
   123
    VFP_FMXR(,VFP_XREG_FPSCR,14);
sl@0
   124
      
sl@0
   125
   asm("vfp_divide_f32_end:");
sl@0
   126
   asm ("ldmia sp!, {fp, pc}");
sl@0
   127
    
sl@0
   128
    }
sl@0
   129
sl@0
   130
EXPORT_C __NAKED__ void vfp_multiply_f32 (float *d, const float *s1, const float *s2, int n)
sl@0
   131
    {
sl@0
   132
    asm(" stmdb sp!, {fp, lr}"); 
sl@0
   133
    asm("ands ip, r3, #7"); 
sl@0
   134
    asm("beq vfp_multiply_f32_unroll");
sl@0
   135
      
sl@0
   136
   //asm("fldmias r1!, {s0}"); 
sl@0
   137
   VFP_FLDMIAS(CC_AL,1,0,1);
sl@0
   138
   
sl@0
   139
   asm("vfp_multiply_f32_loop1:");
sl@0
   140
      
sl@0
   141
   //asm("fldmias r2!, {s1}");    
sl@0
   142
     VFP_FLDMIAS(CC_AL,2,1,1);
sl@0
   143
 
sl@0
   144
    //asm("fadds s2, s0, s1");
sl@0
   145
    VFP_FMULS(CC_AL,2,0,1);
sl@0
   146
      
sl@0
   147
    //asm("fstmias r0!, {s2}");
sl@0
   148
    VFP_FSTMIAS(CC_AL,0,2,1);   
sl@0
   149
    
sl@0
   150
    asm("subs ip, ip, #1"); 
sl@0
   151
    asm("bne vfp_multiply_f32_loop1");
sl@0
   152
    asm("vfp_multiply_f32_unroll: movs ip, r3, lsr #3"); 
sl@0
   153
    asm("beq vfp_multiply_f32_end");
sl@0
   154
    
sl@0
   155
    
sl@0
   156
    //asm("fmrx lr, fpscr");  
sl@0
   157
    VFP_FMRX(,14,VFP_XREG_FPSCR);
sl@0
   158
    
sl@0
   159
    
sl@0
   160
    asm("mov fp, #7"); 
sl@0
   161
    asm("orr fp, lr, fp, lsl #16"); 
sl@0
   162
    
sl@0
   163
    //asm("fmxr fpscr, fp"); 
sl@0
   164
    VFP_FMXR(,VFP_XREG_FPSCR,11);
sl@0
   165
        
sl@0
   166
      
sl@0
   167
    asm("vfp_multiply_f32_loop2:");
sl@0
   168
  
sl@0
   169
    //asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}"); 
sl@0
   170
        VFP_FLDMIAS(CC_AL,1,8,8);   
sl@0
   171
 
sl@0
   172
    //asm("fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}");
sl@0
   173
    VFP_FLDMIAS(CC_AL,2,16,8);
sl@0
   174
   
sl@0
   175
    //asm("fadds s24, s8, s16"); 
sl@0
   176
        VFP_FMULS(CC_AL,24,8,16);      
sl@0
   177
   
sl@0
   178
    asm("subs ip, ip, #1"); 
sl@0
   179
    asm("bne vfp_multiply_f32_loop2"); 
sl@0
   180
  
sl@0
   181
    //asm("fmxr fpscr, lr"); 
sl@0
   182
    VFP_FMXR(,VFP_XREG_FPSCR,14);
sl@0
   183
      
sl@0
   184
   asm("vfp_multiply_f32_end:");
sl@0
   185
   asm ("ldmia sp!, {fp, pc}");
sl@0
   186
    
sl@0
   187
    }
sl@0
   188
sl@0
   189
EXPORT_C __NAKED__ void vfp_subtract_f32 (float *d, const float *s1, const float *s2, int n)
sl@0
   190
    {
sl@0
   191
    asm(" stmdb sp!, {fp, lr}"); 
sl@0
   192
    asm("ands ip, r3, #7"); 
sl@0
   193
    asm("beq vfp_subtract_f32_unroll");
sl@0
   194
      
sl@0
   195
   //asm("fldmias r1!, {s0}"); 
sl@0
   196
   VFP_FLDMIAS(CC_AL,1,0,1);
sl@0
   197
   
sl@0
   198
   asm("vfp_subtract_f32_loop1:");
sl@0
   199
      
sl@0
   200
   //asm("fldmias r2!, {s1}");    
sl@0
   201
     VFP_FLDMIAS(CC_AL,2,1,1);
sl@0
   202
 
sl@0
   203
    //asm("fadds s2, s0, s1");
sl@0
   204
    VFP_FSUBS(CC_AL,2,0,1);
sl@0
   205
      
sl@0
   206
    //asm("fstmias r0!, {s2}");
sl@0
   207
    VFP_FSTMIAS(CC_AL,0,2,1);   
sl@0
   208
    
sl@0
   209
    asm("subs ip, ip, #1"); 
sl@0
   210
    asm("bne vfp_subtract_f32_loop1");
sl@0
   211
    asm("vfp_subtract_f32_unroll: movs ip, r3, lsr #3"); 
sl@0
   212
    asm("beq vfp_subtract_f32_end");
sl@0
   213
    
sl@0
   214
    
sl@0
   215
    //asm("fmrx lr, fpscr");  
sl@0
   216
    VFP_FMRX(,14,VFP_XREG_FPSCR);
sl@0
   217
    
sl@0
   218
    
sl@0
   219
    asm("mov fp, #7"); 
sl@0
   220
    asm("orr fp, lr, fp, lsl #16"); 
sl@0
   221
    
sl@0
   222
    //asm("fmxr fpscr, fp"); 
sl@0
   223
    VFP_FMXR(,VFP_XREG_FPSCR,11);
sl@0
   224
        
sl@0
   225
      
sl@0
   226
    asm("vfp_subtract_f32_loop2:");
sl@0
   227
  
sl@0
   228
    //asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}"); 
sl@0
   229
        VFP_FLDMIAS(CC_AL,1,8,8);   
sl@0
   230
 
sl@0
   231
    //asm("fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}");
sl@0
   232
    VFP_FLDMIAS(CC_AL,2,16,8);
sl@0
   233
   
sl@0
   234
    //asm("fadds s24, s8, s16"); 
sl@0
   235
        VFP_FSUBS(CC_AL,24,8,16);      
sl@0
   236
   
sl@0
   237
    asm("subs ip, ip, #1"); 
sl@0
   238
    asm("bne vfp_subtract_f32_loop2"); 
sl@0
   239
  
sl@0
   240
    //asm("fmxr fpscr, lr"); 
sl@0
   241
    VFP_FMXR(,VFP_XREG_FPSCR,14);
sl@0
   242
      
sl@0
   243
   asm("vfp_subtract_f32_end:");
sl@0
   244
   asm ("ldmia sp!, {fp, pc}");
sl@0
   245
    
sl@0
   246
    }
sl@0
   247
sl@0
   248
EXPORT_C __NAKED__ void vfp_add_f64 (double *d, const double *s1, const double *s2, int n)
sl@0
   249
{
sl@0
   250
    asm("stmdb         sp!, {fp, lr}");            /* save registers to stack */     
sl@0
   251
    asm("ands          ip, r3, #3");               /* ip = n % 3 */                 
sl@0
   252
    asm("beq           vfp_add_f64_unroll"); /* if ip == 0 goto prep_loop2 */ 
sl@0
   253
    asm("vfp_add_f64_loop1:");                                                   
sl@0
   254
    
sl@0
   255
    //asm("fldmiad       r1!, {d0}");   
sl@0
   256
    VFP_FLDMIAD(CC_AL,1,0,1);
sl@0
   257
                                                  
sl@0
   258
    //asm("fldmiad       r2!, {d1}");       
sl@0
   259
    VFP_FLDMIAD(CC_AL,2,1,1);         
sl@0
   260
                                         
sl@0
   261
    //asm("faddd  d2, d0, d1");       
sl@0
   262
    VFP_FADDD(,2,0,1);
sl@0
   263
                                             
sl@0
   264
    //asm("fstmiad       r0!, {d2}");     
sl@0
   265
    VFP_FSTMIAD(CC_AL,0,2,1);                                                     
sl@0
   266
                                      
sl@0
   267
    asm("subs          ip, ip, #1");                                                
sl@0
   268
    asm("bne           vfp_add_f64_loop1");                                   
sl@0
   269
    asm("vfp_add_f64_unroll:");                  /* unroll by 4 */                
sl@0
   270
    asm("movs          ip, r3, lsr #2");           /* ip = n / 4 */                 
sl@0
   271
    asm("  beq           vfp_add_f64_end");    /* if ip == 0 goto finish */     
sl@0
   272
    
sl@0
   273
    //asm("  fmrx          lr, fpscr");                /* read fpscr register into arm */
sl@0
   274
    VFP_FMRX(,14,VFP_XREG_FPSCR);
sl@0
   275
    
sl@0
   276
    asm("mov           fp, #3");                                                    
sl@0
   277
    asm("orr           fp, lr, fp, lsl #16");      /* set vector lenght to 8 */     
sl@0
   278
    
sl@0
   279
    //asm("fmxr          fpscr, fp");      
sl@0
   280
    VFP_FMXR(,VFP_XREG_FPSCR,11);    
sl@0
   281
                                               
sl@0
   282
    asm("vfp_add_f64_loop2:");                                                    
sl@0
   283
    
sl@0
   284
    //asm("fldmiad       r1!, {d4, d5, d6, d7}");    
sl@0
   285
    VFP_FLDMIAS(CC_AL,1,4,4);                                 
sl@0
   286
sl@0
   287
    //asm("fldmiad       r2!, {d8, d9, d10, d11}");                                    
sl@0
   288
    VFP_FLDMIAS(CC_AL,2,8,4);                                 
sl@0
   289
    
sl@0
   290
    //asm("faddd  d12, d4, d8");                                                
sl@0
   291
    VFP_FADDD(,12,4,8);
sl@0
   292
    
sl@0
   293
    //asm("fstmiad       r0!, {d12, d13, d14, d15}");                                  
sl@0
   294
    VFP_FSTMIAS(CC_AL,0,12,4);                                 
sl@0
   295
    
sl@0
   296
    asm("subs          ip, ip, #1");                                                
sl@0
   297
    asm("bne           vfp_add_f64_loop2");                                   
sl@0
   298
    
sl@0
   299
    //asm("fmxr          fpscr, lr");                /* restore original fpscr */      
sl@0
   300
    VFP_FMXR(,VFP_XREG_FPSCR,14);
sl@0
   301
                                    
sl@0
   302
    asm("vfp_add_f64_end:");                                                      
sl@0
   303
    asm("ldmia         sp!, {fp, pc}");        /* recovering from stack and return */   
sl@0
   304
}     
sl@0
   305
  
sl@0
   306
  
sl@0
   307
sl@0
   308
  
sl@0
   309
EXPORT_C __NAKED__  void vfp_abs_f32_f32_ns(float *d, const float *s, int n) 
sl@0
   310
    {                                                         
sl@0
   311
    asm("stmdb         sp!, {fp, lr}");            /* save registers to stack */    
sl@0
   312
    asm("ands          ip, r2, #7");               /* ip = n % 8 */                 
sl@0
   313
    asm("beq           vfp_abs_f32_f32_ns_unroll"); /* if ip == 0 goto prep_loop2 */ 
sl@0
   314
  	asm("vfp_abs_f32_f32_ns_loop1:");                                                   
sl@0
   315
   
sl@0
   316
    //asm("fldmias       r1!, {s0}");  
sl@0
   317
    VFP_FLDMIAS(CC_AL,1,0,1);
sl@0
   318
                                                   
sl@0
   319
    //asm("fabss  s2, s0");  
sl@0
   320
    VFP_FABSS(CC_AL,2,0);
sl@0
   321
                                                      
sl@0
   322
    //asm("fstmias       r0!, {s2}");                                                 
sl@0
   323
    VFP_FSTMIAS(CC_AL,0,2,1);   
sl@0
   324
   
sl@0
   325
    asm("subs          ip, ip, #1");                                                
sl@0
   326
    asm("bne           vfp_abs_f32_f32_ns_loop1");                                   
sl@0
   327
  	asm("vfp_abs_f32_f32_ns_unroll:");                 /* unroll by 8 */                
sl@0
   328
    asm("movs          ip, r2, lsr #3");           /* ip = n / 8 */                 
sl@0
   329
    asm("beq           vfp_abs_f32_f32_ns_end");    /* if ip == 0 goto finish */     
sl@0
   330
   
sl@0
   331
    //asm("fmrx          lr, fpscr");                /* read fpscr register into arm */
sl@0
   332
    VFP_FMRX(,14,VFP_XREG_FPSCR);
sl@0
   333
   
sl@0
   334
    asm("mov           fp, #7");                                                    
sl@0
   335
    asm("orr           fp, lr, fp, lsl #16");      /* set vector lenght to 8 */     
sl@0
   336
  
sl@0
   337
    //asm("fmxr          fpscr, fp");                                                 
sl@0
   338
  	VFP_FMXR(,VFP_XREG_FPSCR,11); 
sl@0
   339
  
sl@0
   340
  	asm("vfp_abs_f32_f32_ns_loop2:");                                                   
sl@0
   341
   
sl@0
   342
    //asm("fldmias       r1!, {s8, s9, s10, s11, s12, s13, s14, s15}"); 
sl@0
   343
    VFP_FLDMIAS(CC_AL,1,8,8);
sl@0
   344
                   
sl@0
   345
    //asm("fabss  s24, s8");                                                   
sl@0
   346
    VFP_FABSS(CC_AL,2,0);
sl@0
   347
   
sl@0
   348
    //asm("fstmias       r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");             
sl@0
   349
    VFP_FSTMIAS(CC_AL,0,24,8);
sl@0
   350
    
sl@0
   351
    asm("subs          ip, ip, #1");                                                
sl@0
   352
    asm("bne           vfp_abs_f32_f32_ns_loop2");                                   
sl@0
   353
    
sl@0
   354
    //asm("fmxr          fpscr, lr");                /* restore original fpscr */      
sl@0
   355
  	VFP_FMXR(,VFP_XREG_FPSCR,14);
sl@0
   356
  	 
sl@0
   357
  	asm("vfp_abs_f32_f32_ns_end:");                                                      
sl@0
   358
    asm("ldmia         sp!, {fp, pc}");        /* recovering from stack and return */ 
sl@0
   359
	} 
sl@0
   360
	
sl@0
   361
EXPORT_C __NAKED__  void vfp_negative_f32(float *d, const float *s, int n)
sl@0
   362
    {                                                         
sl@0
   363
    asm("stmdb         sp!, {fp, lr}");            /* save registers to stack */    
sl@0
   364
    asm("ands          ip, r2, #7");               /* ip = n % 8 */                 
sl@0
   365
    asm("beq           vfp_negative_f32_unroll"); /* if ip == 0 goto prep_loop2 */ 
sl@0
   366
  	asm("vfp_negative_f32_loop1:");                                                   
sl@0
   367
    
sl@0
   368
    //asm("fldmias       r1!, {s0}"); 
sl@0
   369
    VFP_FLDMIAS(CC_AL,1,0,1);
sl@0
   370
                                                    
sl@0
   371
    //asm("fnegs  s2, s0");                                                    
sl@0
   372
    VFP_FNEGS(CC_AL,2,0);
sl@0
   373
     
sl@0
   374
    //asm("fstmias       r0!, {s2}");                                                 
sl@0
   375
    VFP_FSTMIAS(CC_AL,0,2,1); 
sl@0
   376
    
sl@0
   377
    asm("subs          ip, ip, #1");                                                
sl@0
   378
    asm("bne           vfp_negative_f32_loop1");                                   
sl@0
   379
  	asm("vfp_negative_f32_unroll:");                 /* unroll by 8 */                
sl@0
   380
    asm("movs          ip, r2, lsr #3");           /* ip = n / 8 */                 
sl@0
   381
    asm("beq           vfp_negative_f32_end");    /* if ip == 0 goto finish */     
sl@0
   382
   
sl@0
   383
    //asm("fmrx          lr, fpscr");                /* read fpscr register into arm */
sl@0
   384
    VFP_FMRX(,14,VFP_XREG_FPSCR);
sl@0
   385
   
sl@0
   386
    asm("mov           fp, #7");                                                    
sl@0
   387
    asm("orr           fp, lr, fp, lsl #16");      /* set vector lenght to 8 */     
sl@0
   388
   
sl@0
   389
   // asm("fmxr          fpscr, fp");                                                 
sl@0
   390
  	VFP_FMXR(,VFP_XREG_FPSCR,11); 
sl@0
   391
  	
sl@0
   392
  	asm("vfp_negative_f32_loop2:");                                                   
sl@0
   393
    
sl@0
   394
    //asm("fldmias       r1!, {s8, s9, s10, s11, s12, s13, s14, s15}"); 
sl@0
   395
    VFP_FLDMIAS(CC_AL,1,8,8);
sl@0
   396
                   
sl@0
   397
    //asm("fnegs  s24, s8");                                                   
sl@0
   398
    VFP_FNEGS(CC_AL,2,0);
sl@0
   399
     
sl@0
   400
    //asm("fstmias       r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");   
sl@0
   401
    VFP_FSTMIAS(CC_AL,0,24,8);
sl@0
   402
              
sl@0
   403
    asm("subs          ip, ip, #1");                                                
sl@0
   404
    asm("bne           vfp_negative_f32_loop2");           
sl@0
   405
                            
sl@0
   406
    //asm("fmxr          fpscr, lr");                /* restore original fpscr */      
sl@0
   407
  	VFP_FMXR(,VFP_XREG_FPSCR,14);
sl@0
   408
  	
sl@0
   409
  	asm("vfp_negative_f32_end:");                                                      
sl@0
   410
    asm("ldmia         sp!, {fp, pc}");        /* recovering from stack and return */ 
sl@0
   411
	} 
sl@0
   412
		
sl@0
   413
EXPORT_C __NAKED__ 	void vfp_abs_f64_f64_ns(double *d, const double *s, int n)
sl@0
   414
	{                                                       
sl@0
   415
   asm("stmdb         sp!, {fp, lr}");            /* save registers to stack */    
sl@0
   416
   asm("ands          ip, r2, #3");               /* ip = n % 3 */                 
sl@0
   417
   asm("beq           vfp_abs_f64_f64_ns_unroll"); /* if ip == 0 goto prep_loop2 */ 
sl@0
   418
   asm("vfp_abs_f64_f64_ns_loop1:");                                                   
sl@0
   419
sl@0
   420
   //asm("fldmiad       r1!, {d0}"); 
sl@0
   421
   VFP_FLDMIAD(CC_AL,1,0,1);
sl@0
   422
                                                   
sl@0
   423
   //asm("fabsd  d2, d0"); 
sl@0
   424
   VFP_FABSD(,2,0);
sl@0
   425
                                                      
sl@0
   426
   //asm("fstmiad       r0!, {d2}");                                                 
sl@0
   427
   VFP_FSTMIAD(CC_AL,0,2,1);  
sl@0
   428
    
sl@0
   429
   asm("subs          ip, ip, #1");                                                
sl@0
   430
   asm("bne           vfp_abs_f64_f64_ns_loop1");                                   
sl@0
   431
   asm("vfp_abs_f64_f64_ns_unroll:");                 /* unroll by 4 */                
sl@0
   432
   asm("movs          ip, r2, lsr #2");           /* ip = n / 4 */                 
sl@0
   433
   asm("beq           vfp_abs_f64_f64_ns_end");    /* if ip == 0 goto finish */     
sl@0
   434
   
sl@0
   435
   //asm("fmrx          lr, fpscr");                /* read fpscr register into arm */
sl@0
   436
 	 VFP_FMRX(,14,VFP_XREG_FPSCR);
sl@0
   437
   
sl@0
   438
   asm("mov           fp, #3");                                                    
sl@0
   439
   asm("orr           fp, lr, fp, lsl #16");      /* set vector lenght to 4 */     
sl@0
   440
   
sl@0
   441
   //asm("fmxr          fpscr, fp");                                                 
sl@0
   442
   VFP_FMXR(,VFP_XREG_FPSCR,11); 
sl@0
   443
   
sl@0
   444
   asm("vfp_abs_f64_f64_ns_loop2:");                                                   
sl@0
   445
                   
sl@0
   446
                                                     
sl@0
   447
   //asm("fldmiad       r1!, {d4, d5, d6, d7}");                                     
sl@0
   448
   VFP_FLDMIAD(CC_AL,1,4,4);
sl@0
   449
   
sl@0
   450
   //asm("fabsd  d12, d4");   
sl@0
   451
   VFP_FABSD(,12,4);
sl@0
   452
                                                   
sl@0
   453
   //asm("fstmiad       r0!, {d12, d13, d14, d15}");                                 
sl@0
   454
   VFP_FSTMIAD(CC_AL,0,12,4);
sl@0
   455
   
sl@0
   456
   asm("subs          ip, ip, #1");                                                
sl@0
   457
   asm("bne           vfp_abs_f64_f64_ns_loop2");                                   
sl@0
   458
   
sl@0
   459
  // asm("fmxr          fpscr, lr");                /* restore original fpscr */     
sl@0
   460
   	VFP_FMXR(,VFP_XREG_FPSCR,14);
sl@0
   461
   	
sl@0
   462
   asm("vfp_abs_f64_f64_ns_end:");                                                     
sl@0
   463
   asm("ldmia         sp!, {fp, pc}");        /* recovering from stack and return */   
sl@0
   464
	}
sl@0
   465
	
sl@0
   466
	
sl@0
   467
EXPORT_C __NAKED__ 	void vfp_negative_f64(double *d, const double *s, int n)
sl@0
   468
	{                                                       
sl@0
   469
   asm("stmdb         sp!, {fp, lr}");            /* save registers to stack */    
sl@0
   470
   asm("ands          ip, r2, #3");               /* ip = n % 3 */                 
sl@0
   471
   asm("beq           vfp_negative_f64_unroll"); /* if ip == 0 goto prep_loop2 */ 
sl@0
   472
   asm("vfp_negative_f64_loop1:");                                                   
sl@0
   473
   
sl@0
   474
   //asm("fldmiad       r1!, {d0}");                                                 
sl@0
   475
   VFP_FLDMIAD(CC_AL,1,0,1);
sl@0
   476
   
sl@0
   477
   //asm("fnegd  d2, d0");                                                    
sl@0
   478
   VFP_FNEGD(,2,0);
sl@0
   479
   
sl@0
   480
   //asm("fstmiad       r0!, {d2}");                                                 
sl@0
   481
   VFP_FSTMIAD(CC_AL,0,2,1);
sl@0
   482
   
sl@0
   483
   asm("subs          ip, ip, #1");                                                
sl@0
   484
   asm("bne           vfp_negative_f64_loop1");                                   
sl@0
   485
   asm("vfp_negative_f64_unroll:");                 /* unroll by 4 */                
sl@0
   486
   asm("movs          ip, r2, lsr #2");           /* ip = n / 4 */                 
sl@0
   487
   asm("beq           vfp_negative_f64_end");    /* if ip == 0 goto finish */     
sl@0
   488
   
sl@0
   489
   //asm("fmrx          lr, fpscr");                /* read fpscr register into arm */
sl@0
   490
   VFP_FMRX(,14,VFP_XREG_FPSCR);
sl@0
   491
   
sl@0
   492
   asm("mov           fp, #3");                                                    
sl@0
   493
   asm("orr           fp, lr, fp, lsl #16");      /* set vector lenght to 4 */     
sl@0
   494
   
sl@0
   495
   //asm("fmxr          fpscr, fp");                                                 
sl@0
   496
   VFP_FMXR(,VFP_XREG_FPSCR,11); 
sl@0
   497
  
sl@0
   498
   asm("vfp_negative_f64_loop2:");           
sl@0
   499
     
sl@0
   500
   //asm("fldmiad       r1!, {d4, d5, d6, d7}");   
sl@0
   501
   VFP_FLDMIAD(CC_AL,1,4,4);
sl@0
   502
                                      
sl@0
   503
   //asm("fnegd  d12, d4"); 
sl@0
   504
   VFP_FNEGD(,12,4);
sl@0
   505
                                                       
sl@0
   506
   //asm("fstmiad       r0!, {d12, d13, d14, d15}");                                 
sl@0
   507
   VFP_FSTMIAD(CC_AL,0,12,4);
sl@0
   508
    
sl@0
   509
   asm("subs          ip, ip, #1");                                                
sl@0
   510
   asm("bne           vfp_negative_f64_loop2");                                   
sl@0
   511
   
sl@0
   512
   //asm("fmxr          fpscr, lr");                /* restore original fpscr */     
sl@0
   513
   VFP_FMXR(,VFP_XREG_FPSCR,14);
sl@0
   514
   	
sl@0
   515
   asm("vfp_negative_f64_end:");                                                     
sl@0
   516
   asm("ldmia         sp!, {fp, pc}");        /* recovering from stack and return */   
sl@0
   517
	}
sl@0
   518
		
sl@0
   519
		
sl@0
   520
//Rakhi changes
sl@0
   521
EXPORT_C __NAKED__ void vfp_divide_f64 (double *d, const double *s1, const double *s2, int n)
{
    /* d[i] = s1[i] / s2[i] for i in [0, n), double precision.
       Handles the n % 4 leftover elements one at a time, then the rest in
       short-vectors of 4 using the VFP FPSCR vector-length (LEN) field.
       Register use: r0=d, r1=s1, r2=s2, r3=n, ip=counter, lr=saved FPSCR. */
    asm("stmdb         sp!, {fp, lr}");            /* save registers to stack */
    asm("ands          ip, r3, #3");               /* ip = n % 4 */
    asm("beq           vfp_divide_f64_unroll");    /* if ip == 0 goto unrolled loop */
    asm("vfp_divide_f64_loop1:");

    //asm("fldmiad       r1!, {d0}");
    VFP_FLDMIAD(CC_AL,1,0,1);

    //asm("fldmiad       r2!, {d1}");
    VFP_FLDMIAD(CC_AL,2,1,1);

    //asm("fdivd  d2, d0, d1");
    VFP_FDIVD(,2,0,1);

    //asm("fstmiad       r0!, {d2}");
    VFP_FSTMIAD(CC_AL,0,2,1);

    asm("subs          ip, ip, #1");
    asm("bne           vfp_divide_f64_loop1");
    asm("vfp_divide_f64_unroll:");                 /* unroll by 4 */
    asm("movs          ip, r3, lsr #2");           /* ip = n / 4 */
    asm("beq           vfp_divide_f64_end");       /* if ip == 0 goto finish */

    //asm("fmrx          lr, fpscr");                /* read fpscr register into arm */
    VFP_FMRX(,14,VFP_XREG_FPSCR);

    asm("mov           fp, #3");
    asm("orr           fp, lr, fp, lsl #16");      /* set vector length to 4 (FPSCR.LEN = 3) */

    //asm("fmxr          fpscr, fp");
    VFP_FMXR(,VFP_XREG_FPSCR,11);

    asm("vfp_divide_f64_loop2:");

    //asm("fldmiad       r1!, {d4, d5, d6, d7}");
    VFP_FLDMIAD(CC_AL,1,4,4);                      /* BUG FIX: was VFP_FLDMIAS (single precision) */

    //asm("fldmiad       r2!, {d8, d9, d10, d11}");
    VFP_FLDMIAD(CC_AL,2,8,4);                      /* BUG FIX: was VFP_FLDMIAS (single precision) */

    //asm("fdivd  d12, d4, d8");
    VFP_FDIVD(,12,4,8);

    //asm("fstmiad       r0!, {d12, d13, d14, d15}");
    VFP_FSTMIAD(CC_AL,0,12,4);                     /* BUG FIX: was VFP_FSTMIAS (single precision) */

    asm("subs          ip, ip, #1");
    asm("bne           vfp_divide_f64_loop2");

    //asm("fmxr          fpscr, lr");                /* restore original fpscr */
    VFP_FMXR(,VFP_XREG_FPSCR,14);

    asm("vfp_divide_f64_end:");
    asm("ldmia         sp!, {fp, pc}");            /* restore from stack and return */
}
sl@0
   578
sl@0
   579
EXPORT_C __NAKED__ void vfp_multiply_f64 (double *d, const double *s1, const double *s2, int n)
{
    /* d[i] = s1[i] * s2[i] for i in [0, n), double precision.
       Handles the n % 4 leftover elements one at a time, then the rest in
       short-vectors of 4 using the VFP FPSCR vector-length (LEN) field.
       Register use: r0=d, r1=s1, r2=s2, r3=n, ip=counter, lr=saved FPSCR. */
    asm("stmdb         sp!, {fp, lr}");            /* save registers to stack */
    asm("ands          ip, r3, #3");               /* ip = n % 4 */
    asm("beq           vfp_multiply_f64_unroll");  /* if ip == 0 goto unrolled loop */
    asm("vfp_multiply_f64_loop1:");

    //asm("fldmiad       r1!, {d0}");
    VFP_FLDMIAD(CC_AL,1,0,1);

    //asm("fldmiad       r2!, {d1}");
    VFP_FLDMIAD(CC_AL,2,1,1);

    //asm("fmuld  d2, d0, d1");
    VFP_FMULD(,2,0,1);

    //asm("fstmiad       r0!, {d2}");
    VFP_FSTMIAD(CC_AL,0,2,1);

    asm("subs          ip, ip, #1");
    asm("bne           vfp_multiply_f64_loop1");
    asm("vfp_multiply_f64_unroll:");               /* unroll by 4 */
    asm("movs          ip, r3, lsr #2");           /* ip = n / 4 */
    asm("beq           vfp_multiply_f64_end");     /* if ip == 0 goto finish */

    //asm("fmrx          lr, fpscr");                /* read fpscr register into arm */
    VFP_FMRX(,14,VFP_XREG_FPSCR);

    asm("mov           fp, #3");
    asm("orr           fp, lr, fp, lsl #16");      /* set vector length to 4 (FPSCR.LEN = 3) */

    //asm("fmxr          fpscr, fp");
    VFP_FMXR(,VFP_XREG_FPSCR,11);

    asm("vfp_multiply_f64_loop2:");

    //asm("fldmiad       r1!, {d4, d5, d6, d7}");
    VFP_FLDMIAD(CC_AL,1,4,4);                      /* BUG FIX: was VFP_FLDMIAS (single precision) */

    //asm("fldmiad       r2!, {d8, d9, d10, d11}");
    VFP_FLDMIAD(CC_AL,2,8,4);                      /* BUG FIX: was VFP_FLDMIAS (single precision) */

    //asm("fmuld  d12, d4, d8");
    VFP_FMULD(,12,4,8);

    //asm("fstmiad       r0!, {d12, d13, d14, d15}");
    VFP_FSTMIAD(CC_AL,0,12,4);                     /* BUG FIX: was VFP_FSTMIAS (single precision) */

    asm("subs          ip, ip, #1");
    asm("bne           vfp_multiply_f64_loop2");

    //asm("fmxr          fpscr, lr");                /* restore original fpscr */
    VFP_FMXR(,VFP_XREG_FPSCR,14);

    asm("vfp_multiply_f64_end:");
    asm("ldmia         sp!, {fp, pc}");            /* restore from stack and return */
}
sl@0
   636
sl@0
   637
EXPORT_C __NAKED__ void vfp_subtract_f64 (double *d, const double *s1, const double *s2, int n)
{
    /* d[i] = s1[i] - s2[i] for i in [0, n), double precision.
       Handles the n % 4 leftover elements one at a time, then the rest in
       short-vectors of 4 using the VFP FPSCR vector-length (LEN) field.
       Register use: r0=d, r1=s1, r2=s2, r3=n, ip=counter, lr=saved FPSCR. */
    asm("stmdb         sp!, {fp, lr}");            /* save registers to stack */
    asm("ands          ip, r3, #3");               /* ip = n % 4 */
    asm("beq           vfp_subtract_f64_unroll");  /* if ip == 0 goto unrolled loop */
    asm("vfp_subtract_f64_loop1:");

    //asm("fldmiad       r1!, {d0}");
    VFP_FLDMIAD(CC_AL,1,0,1);

    //asm("fldmiad       r2!, {d1}");
    VFP_FLDMIAD(CC_AL,2,1,1);

    //asm("fsubd  d2, d0, d1");
    VFP_FSUBD(,2,0,1);

    //asm("fstmiad       r0!, {d2}");
    VFP_FSTMIAD(CC_AL,0,2,1);

    asm("subs          ip, ip, #1");
    asm("bne           vfp_subtract_f64_loop1");
    asm("vfp_subtract_f64_unroll:");               /* unroll by 4 */
    asm("movs          ip, r3, lsr #2");           /* ip = n / 4 */
    asm("beq           vfp_subtract_f64_end");     /* if ip == 0 goto finish */

    //asm("fmrx          lr, fpscr");                /* read fpscr register into arm */
    VFP_FMRX(,14,VFP_XREG_FPSCR);

    asm("mov           fp, #3");
    asm("orr           fp, lr, fp, lsl #16");      /* set vector length to 4 (FPSCR.LEN = 3) */

    //asm("fmxr          fpscr, fp");
    VFP_FMXR(,VFP_XREG_FPSCR,11);

    asm("vfp_subtract_f64_loop2:");

    //asm("fldmiad       r1!, {d4, d5, d6, d7}");
    VFP_FLDMIAD(CC_AL,1,4,4);                      /* BUG FIX: was VFP_FLDMIAS (single precision) */

    //asm("fldmiad       r2!, {d8, d9, d10, d11}");
    VFP_FLDMIAD(CC_AL,2,8,4);                      /* BUG FIX: was VFP_FLDMIAS (single precision) */

    //asm("fsubd  d12, d4, d8");
    VFP_FSUBD(,12,4,8);

    //asm("fstmiad       r0!, {d12, d13, d14, d15}");
    VFP_FSTMIAD(CC_AL,0,12,4);                     /* BUG FIX: was VFP_FSTMIAS (single precision) */

    asm("subs          ip, ip, #1");
    asm("bne           vfp_subtract_f64_loop2");

    //asm("fmxr          fpscr, lr");                /* restore original fpscr */
    VFP_FMXR(,VFP_XREG_FPSCR,14);

    asm("vfp_subtract_f64_end:");
    asm("ldmia         sp!, {fp, pc}");            /* restore from stack and return */
}
sl@0
   694
sl@0
   695
EXPORT_C __NAKED__ void vfp_scalaradd_f32_ns (float *d, const float *s1, const float *s2_1, int n)
{
    /* d[i] = s1[i] + *s2_1 for i in [0, n): broadcast-add of a scalar,
       single precision.  The scalar stays in s0-bank register s1 so VFP
       short-vector mode broadcasts it.  Handles the n % 8 leftovers one
       at a time, then the rest in short-vectors of 8 via FPSCR.LEN.
       Register use: r0=d, r1=s1, r2=s2_1, r3=n, ip=counter, lr=saved FPSCR. */
    asm("stmdb         sp!, {fp, lr}");            /* save registers to stack */

    //asm("fldmias       r2, {s1}");                 /* load scalar value */
    VFP_FLDMIAS(CC_AL,2,1,1);

    asm("ands          ip, r3, #7");               /* ip = n % 8 */
    asm("beq           vfp_scalaradd_f32_ns_unroll"); /* if ip == 0 goto unrolled loop */
    asm("vfp_scalaradd_f32_ns_loop1:");

    //asm("fldmias       r1!, {s0}");
    VFP_FLDMIAS(CC_AL,1,0,1);

    //asm("fadds  s2, s0, s1");
    VFP_FADDS(CC_AL,2,0,1);

    //asm("fstmias       r0!, {s2}");
    VFP_FSTMIAS(CC_AL,0,2,1);                      /* BUG FIX: count was 8, storing s2..s9 per scalar element */

    asm("subs          ip, ip, #1");
    asm("bne           vfp_scalaradd_f32_ns_loop1");
    asm("vfp_scalaradd_f32_ns_unroll:");           /* unroll by 8 */
    asm("movs          ip, r3, lsr #3");           /* ip = n / 8 */
    asm("beq           vfp_scalaradd_f32_ns_end"); /* if ip == 0 goto finish */

    /* BUG FIX: the comment below previously ended in '\', which line-spliced
       the VFP_FMRX into the comment and removed it from the build. */
    //asm("fmrx          lr, fpscr");                /* read fpscr register into arm */
    VFP_FMRX(,14,VFP_XREG_FPSCR);

    asm("mov           fp, #7");
    asm("orr           fp, lr, fp, lsl #16");      /* set vector length to 8 (FPSCR.LEN = 7) */

    //asm("fmxr          fpscr, fp");
    VFP_FMXR(,VFP_XREG_FPSCR,11);

    asm("vfp_scalaradd_f32_ns_loop2:");

    //asm("fldmias       r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
    VFP_FLDMIAS(CC_AL,1,8,8);

    //asm("fadds  s24, s8, s1");
    VFP_FADDS(CC_AL,24,8,1);

    //asm("fstmias       r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
    VFP_FSTMIAS(CC_AL,0,24,8);

    asm("subs          ip, ip, #1");
    asm("bne           vfp_scalaradd_f32_ns_loop2");

    //asm("fmxr          fpscr, lr");                /* restore original fpscr */
    VFP_FMXR(,VFP_XREG_FPSCR,14);

    asm("vfp_scalaradd_f32_ns_end:");
    asm("ldmia         sp!, {fp, pc}");            /* restore from stack and return */
}
sl@0
   749
sl@0
   750
EXPORT_C __NAKED__ void vfp_scalarmultiply_f32_ns (float *d, const float *s1, const float *s2_1, int n)
{
    /* d[i] = s1[i] * *s2_1 for i in [0, n): broadcast-multiply by a scalar,
       single precision.  The scalar stays in s0-bank register s1 so VFP
       short-vector mode broadcasts it.  Handles the n % 8 leftovers one
       at a time, then the rest in short-vectors of 8 via FPSCR.LEN.
       Register use: r0=d, r1=s1, r2=s2_1, r3=n, ip=counter, lr=saved FPSCR. */
    asm("stmdb         sp!, {fp, lr}");            /* save registers to stack */

    //asm("fldmias       r2, {s1}");                 /* load scalar value */
    VFP_FLDMIAS(CC_AL,2,1,1);

    asm("ands          ip, r3, #7");               /* ip = n % 8 */
    asm("beq           vfp_scalarmultiply_f32_ns_unroll"); /* if ip == 0 goto unrolled loop */
    asm("vfp_scalarmultiply_f32_ns_loop1:");

    //asm("fldmias       r1!, {s0}");
    VFP_FLDMIAS(CC_AL,1,0,1);

    //asm("fmuls  s2, s0, s1");
    VFP_FMULS(CC_AL,2,0,1);

    //asm("fstmias       r0!, {s2}");
    VFP_FSTMIAS(CC_AL,0,2,1);                      /* BUG FIX: count was 8, storing s2..s9 per scalar element */

    asm("subs          ip, ip, #1");
    asm("bne           vfp_scalarmultiply_f32_ns_loop1");
    asm("vfp_scalarmultiply_f32_ns_unroll:");      /* unroll by 8 */
    asm("movs          ip, r3, lsr #3");           /* ip = n / 8 */
    asm("beq           vfp_scalarmultiply_f32_ns_end"); /* if ip == 0 goto finish */

    /* BUG FIX: the comment below previously ended in '\', which line-spliced
       the VFP_FMRX into the comment and removed it from the build. */
    //asm("fmrx          lr, fpscr");                /* read fpscr register into arm */
    VFP_FMRX(,14,VFP_XREG_FPSCR);

    asm("mov           fp, #7");
    asm("orr           fp, lr, fp, lsl #16");      /* set vector length to 8 (FPSCR.LEN = 7) */

    //asm("fmxr          fpscr, fp");
    VFP_FMXR(,VFP_XREG_FPSCR,11);

    asm("vfp_scalarmultiply_f32_ns_loop2:");

    //asm("fldmias       r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
    VFP_FLDMIAS(CC_AL,1,8,8);

    //asm("fmuls  s24, s8, s1");
    VFP_FMULS(CC_AL,24,8,1);

    //asm("fstmias       r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
    VFP_FSTMIAS(CC_AL,0,24,8);

    asm("subs          ip, ip, #1");
    asm("bne           vfp_scalarmultiply_f32_ns_loop2");

    //asm("fmxr          fpscr, lr");                /* restore original fpscr */
    VFP_FMXR(,VFP_XREG_FPSCR,14);

    asm("vfp_scalarmultiply_f32_ns_end:");
    asm("ldmia         sp!, {fp, pc}");            /* restore from stack and return */
}
sl@0
   804
sl@0
   805
EXPORT_C __NAKED__ void vfp_scalaradd_f64_ns (double *d, const double *s1, const double *s2_1, int n)
{
    /* d[i] = s1[i] + *s2_1 for i in [0, n): broadcast-add of a scalar,
       double precision.  The scalar stays in bank-0 register d1 so VFP
       short-vector mode broadcasts it.  Handles the n % 4 leftovers one
       at a time, then the rest in short-vectors of 4 via FPSCR.LEN.
       Register use: r0=d, r1=s1, r2=s2_1, r3=n, ip=counter, lr=saved FPSCR. */
    asm("stmdb         sp!, {fp, lr}");            /* save registers to stack */

    //asm("fldmiad       r2, {d1}");                 /* load scalar value */
    VFP_FLDMIAD(CC_AL,2,1,1);

    asm("ands          ip, r3, #3");               /* ip = n % 4 */
    asm("beq           vfp_scalaradd_f64_ns_unroll"); /* if ip == 0 goto unrolled loop */
    asm("vfp_scalaradd_f64_ns_loop1:");

    //asm("faddd  d2, d0, d1");
    VFP_FLDMIAD(CC_AL,1,0,1);

    //asm("faddd  d2, d0, d1");
    VFP_FADDD(,2,0,1);

    //asm("fstmiad       r0!, {d2}");
    VFP_FSTMIAD(CC_AL,0,2,1);

    asm("subs          ip, ip, #1");
    asm("bne           vfp_scalaradd_f64_ns_loop1");
    asm("vfp_scalaradd_f64_ns_unroll:");           /* unroll by 4 */
    asm("movs          ip, r3, lsr #2");           /* ip = n / 4 */
    asm("beq           vfp_scalaradd_f64_ns_end"); /* if ip == 0 goto finish */

    /* BUG FIX: the comment below previously ended in '\', which line-spliced
       the VFP_FMRX into the comment and removed it from the build, so lr
       never held the original FPSCR and the restore below wrote garbage. */
    //asm("fmrx          lr, fpscr");                /* read fpscr register into arm */
    VFP_FMRX(,14,VFP_XREG_FPSCR);

    asm("mov           fp, #3");
    asm("orr           fp, lr, fp, lsl #16");      /* set vector length to 4 (FPSCR.LEN = 3) */

    //asm("fmxr          fpscr, fp");
    VFP_FMXR(,VFP_XREG_FPSCR,11);

    asm("vfp_scalaradd_f64_ns_loop2:");

    //asm("fldmiad       r1!, {d4, d5, d6, d7}");
    VFP_FLDMIAD(CC_AL,1,4,4);

    //asm("faddd  d12, d4, d1");
    VFP_FADDD(,12,4,1);

    //asm("fstmiad       r0!, {d12, d13, d14, d15}");
    VFP_FSTMIAD(CC_AL,0,12,4);

    asm("subs          ip, ip, #1");
    asm("bne           vfp_scalaradd_f64_ns_loop2");

    //asm("fmxr          fpscr, lr");                /* restore original fpscr */
    VFP_FMXR(,VFP_XREG_FPSCR,14);

    asm("vfp_scalaradd_f64_ns_end:");
    asm("ldmia         sp!, {fp, pc}");            /* restore from stack and return */
}
sl@0
   859
	
sl@0
   860
EXPORT_C __NAKED__ void vfp_scalarmultiply_f64_ns (double *d, const double *s1, const double *s2_1, int n)
{
    /* d[i] = s1[i] * *s2_1 for i in [0, n): broadcast-multiply by a scalar,
       double precision.  The scalar stays in bank-0 register d1 so VFP
       short-vector mode broadcasts it.  Handles the n % 4 leftovers one
       at a time, then the rest in short-vectors of 4 via FPSCR.LEN.
       Register use: r0=d, r1=s1, r2=s2_1, r3=n, ip=counter, lr=saved FPSCR. */
    asm("stmdb         sp!, {fp, lr}");            /* save registers to stack */

    //asm("fldmiad       r2, {d1}");                 /* load scalar value */
    VFP_FLDMIAD(CC_AL,2,1,1);

    asm("ands          ip, r3, #3");               /* ip = n % 4 */
    asm("beq           vfp_scalarmultiply_f64_ns_unroll"); /* if ip == 0 goto unrolled loop */
    asm("vfp_scalarmultiply_f64_ns_loop1:");

    //asm("fldmiad       r1!, {d0}");
    VFP_FLDMIAD(CC_AL,1,0,1);

    //asm("fmuld  d2, d0, d1");
    VFP_FMULD(,2,0,1);

    //asm("fstmiad       r0!, {d2}");
    VFP_FSTMIAD(CC_AL,0,2,1);

    asm("subs          ip, ip, #1");
    asm("bne           vfp_scalarmultiply_f64_ns_loop1");
    asm("vfp_scalarmultiply_f64_ns_unroll:");      /* unroll by 4 */
    asm("movs          ip, r3, lsr #2");           /* ip = n / 4 */
    asm("beq           vfp_scalarmultiply_f64_ns_end"); /* if ip == 0 goto finish */

    /* BUG FIX: the comment below previously ended in '\', which line-spliced
       the VFP_FMRX into the comment and removed it from the build, so lr
       never held the original FPSCR and the restore below wrote garbage. */
    //asm("fmrx          lr, fpscr");                /* read fpscr register into arm */
    VFP_FMRX(,14,VFP_XREG_FPSCR);

    asm("mov           fp, #3");
    asm("orr           fp, lr, fp, lsl #16");      /* set vector length to 4 (FPSCR.LEN = 3) */

    //asm("fmxr          fpscr, fp");
    VFP_FMXR(,VFP_XREG_FPSCR,11);

    asm("vfp_scalarmultiply_f64_ns_loop2:");

    //asm("fldmiad       r1!, {d4, d5, d6, d7}");
    VFP_FLDMIAD(CC_AL,1,4,4);

    //asm("fmuld  d12, d4, d1");
    VFP_FMULD(,12,4,1);

    //asm("fstmiad       r0!, {d12, d13, d14, d15}");
    VFP_FSTMIAD(CC_AL,0,12,4);

    asm("subs          ip, ip, #1");
    asm("bne           vfp_scalarmultiply_f64_ns_loop2");

    //asm("fmxr          fpscr, lr");                /* restore original fpscr */
    VFP_FMXR(,VFP_XREG_FPSCR,14);

    asm("vfp_scalarmultiply_f64_ns_end:");
    asm("ldmia         sp!, {fp, pc}");            /* restore from stack and return */
}
sl@0
   916
	
sl@0
   917
		
sl@0
   918
}
sl@0
   919
#endif