os/ossrv/genericopenlibs/liboil/src/arm/math_vfp_asm.s
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
sl@0
     1
/*
sl@0
     2
 * Copyright (c) 2007
sl@0
     3
 *	Josep Torra <josep@fluendo.com>.  All rights reserved.
sl@0
     4
 *
sl@0
     5
 * Redistribution and use in source and binary forms, with or without
sl@0
     6
 * modification, are permitted provided that the following conditions
sl@0
     7
 * are met:
sl@0
     8
 * 1. Redistributions of source code must retain the above copyright
sl@0
     9
 *    notice, this list of conditions and the following disclaimer.
sl@0
    10
 * 2. Redistributions in binary form must reproduce the above copyright
sl@0
    11
 *    notice, this list of conditions and the following disclaimer in the
sl@0
    12
 *    documentation and/or other materials provided with the distribution.
sl@0
    13
 *
sl@0
    14
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
sl@0
    15
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
sl@0
    16
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
sl@0
    17
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
sl@0
    18
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
sl@0
    19
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
sl@0
    20
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
sl@0
    21
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
sl@0
    22
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
sl@0
    23
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
sl@0
    24
 * SUCH DAMAGE.
sl@0
    25
 */
sl@0
    26
sl@0
    27
#if __VFP_FP__
sl@0
    28
/* 
sl@0
    29
** compile with -mcpu=arm1136j-s -mfpu=vfp -mfloat-abi=softfp
sl@0
    30
**
sl@0
    31
** void vfp_add_f32 (float *d, const float *s1, const float *s2, int n);
sl@0
    32
** void vfp_add_f64 (double *d, const double *s1, const double *s2, int n);
sl@0
    33
** void vfp_divide_f32 (float *d, const float *s1, const float *s2, int n);
sl@0
    34
** void vfp_divide_f64 (double *d, const double *s1, const double *s2, int n);
sl@0
    35
** void vfp_multiply_f32 (float *d, const float *s1, const float *s2, int n);
sl@0
    36
** void vfp_multiply_f64 (double *d, const double *s1, const double *s2, int n);
sl@0
    37
** void vfp_subtract_f32 (float *d, const float *s1, const float *s2, int n);
sl@0
    38
** void vfp_subtract_f64 (double *d, const double *s1, const double *s2, int n);
sl@0
    39
**
sl@0
    40
** d:   $r0     |   s1: $r1     | s2:  $r2     |   n:  $r3     |
sl@0
    41
**
sl@0
    42
*/
sl@0
    43
sl@0
    44
/*
 * UNROLL_F32_TEMPLATE(fname, finst) -- element-wise binary kernel, float.
 *
 * Emits the global function vfp_<fname>, which applies the three-operand
 * VFP instruction <finst>s to two input arrays:
 *
 *     d[i] = s1[i] <finst> s2[i]        for i in [0, n)
 *
 * In:    r0 = d, r1 = s1, r2 = s2, r3 = n
 * Flow:  the first n % 8 elements are processed one at a time; the
 *        remaining n / 8 groups are processed eight at a time as a VFP
 *        short vector (FPSCR.LEN field = 7, STRIDE field = 0).
 * Clobb: r0-r2 (advanced past the data), ip, flags, s0-s2, s8-s15.
 *        fp, FPSCR and d8-d15 are saved and restored.  The vector loop
 *        overwrites s16-s31 (d8-d15), which AAPCS requires a callee to
 *        preserve, hence the save/restore around the vector path.
 * Stack: 8 bytes, plus 64 bytes while the vector path runs.
 */
#define UNROLL_F32_TEMPLATE(fname,finst) \
  .global vfp_ ## fname;                                                      \
  vfp_ ## fname:                                                              \
    stmdb         sp!, {fp, lr};            /* save registers to stack */     \
    ands          ip, r3, #7;               /* ip = n % 8 */                  \
    beq           vfp_ ## fname ## _unroll; /* no leftovers: vector part */   \
  vfp_ ## fname ## _loop1:                  /* one element per iteration */   \
    fldmias       r1!, {s0};                /* next element of first input */ \
    fldmias       r2!, {s1};                /* next element of second input */\
    finst ## s    s2, s0, s1;                                                 \
    fstmias       r0!, {s2};                /* store result, advance d */     \
    subs          ip, ip, #1;                                                 \
    bne           vfp_ ## fname ## _loop1;                                    \
  vfp_ ## fname ## _unroll:                 /* unroll by 8 */                 \
    movs          ip, r3, lsr #3;           /* ip = n / 8 */                  \
    beq           vfp_ ## fname ## _end;    /* if ip == 0 goto finish */      \
    fstmdbd       sp!, {d8-d15};            /* keep callee-saved s16-s31 */   \
    fmrx          lr, fpscr;                /* read fpscr register into arm */\
    bic           fp, lr, #0x00370000;      /* clear LEN and STRIDE fields */ \
    orr           fp, fp, #0x00070000;      /* set vector length to 8 */      \
    fmxr          fpscr, fp;                                                  \
  vfp_ ## fname ## _loop2:                  /* eight elements per iteration */\
    fldmias       r1!, {s8, s9, s10, s11, s12, s13, s14, s15};                \
    fldmias       r2!, {s16, s17, s18, s19, s20, s21, s22, s23};              \
    finst ## s    s24, s8, s16;             /* vector op on s24-s31 */        \
    fstmias       r0!, {s24, s25, s26, s27, s28, s29, s30, s31};              \
    subs          ip, ip, #1;                                                 \
    bne           vfp_ ## fname ## _loop2;                                    \
    fmxr          fpscr, lr;                /* restore original fpscr */      \
    fldmiad       sp!, {d8-d15};            /* restore callee-saved s16-s31 */\
  vfp_ ## fname ## _end:                                                      \
    ldmia         sp!, {fp, pc};        /* recovering from stack and return */
sl@0
    74
sl@0
    75
/*
 * UNROLL_F64_TEMPLATE(fname, finst) -- element-wise binary kernel, double.
 *
 * Emits the global function vfp_<fname>, which applies the three-operand
 * VFP instruction <finst>d to two input arrays:
 *
 *     d[i] = s1[i] <finst> s2[i]        for i in [0, n)
 *
 * In:    r0 = d, r1 = s1, r2 = s2, r3 = n
 * Flow:  the first n % 4 elements are processed one at a time; the
 *        remaining n / 4 groups are processed four at a time as a VFP
 *        short vector (FPSCR.LEN field = 3, STRIDE field = 0).
 * Clobb: r0-r2 (advanced past the data), ip, flags, d0-d2, d4-d7.
 *        fp, FPSCR and d8-d15 are saved and restored.  The vector loop
 *        overwrites d8-d15, which AAPCS requires a callee to preserve,
 *        hence the save/restore around the vector path.
 * Stack: 8 bytes, plus 64 bytes while the vector path runs.
 */
#define UNROLL_F64_TEMPLATE(fname,finst) \
  .global vfp_ ## fname;                                                      \
  vfp_ ## fname:                                                              \
    stmdb         sp!, {fp, lr};            /* save registers to stack */     \
    ands          ip, r3, #3;               /* ip = n % 4 */                  \
    beq           vfp_ ## fname ## _unroll; /* no leftovers: vector part */   \
  vfp_ ## fname ## _loop1:                  /* one element per iteration */   \
    fldmiad       r1!, {d0};                /* next element of first input */ \
    fldmiad       r2!, {d1};                /* next element of second input */\
    finst ## d    d2, d0, d1;                                                 \
    fstmiad       r0!, {d2};                /* store result, advance d */     \
    subs          ip, ip, #1;                                                 \
    bne           vfp_ ## fname ## _loop1;                                    \
  vfp_ ## fname ## _unroll:                 /* unroll by 4 */                 \
    movs          ip, r3, lsr #2;           /* ip = n / 4 */                  \
    beq           vfp_ ## fname ## _end;    /* if ip == 0 goto finish */      \
    fstmdbd       sp!, {d8-d15};            /* keep callee-saved d8-d15 */    \
    fmrx          lr, fpscr;                /* read fpscr register into arm */\
    bic           fp, lr, #0x00370000;      /* clear LEN and STRIDE fields */ \
    orr           fp, fp, #0x00030000;      /* set vector length to 4 */      \
    fmxr          fpscr, fp;                                                  \
  vfp_ ## fname ## _loop2:                  /* four elements per iteration */ \
    fldmiad       r1!, {d4, d5, d6, d7};                                      \
    fldmiad       r2!, {d8, d9, d10, d11};                                    \
    finst ## d    d12, d4, d8;              /* vector op on d12-d15 */        \
    fstmiad       r0!, {d12, d13, d14, d15};                                  \
    subs          ip, ip, #1;                                                 \
    bne           vfp_ ## fname ## _loop2;                                    \
    fmxr          fpscr, lr;                /* restore original fpscr */      \
    fldmiad       sp!, {d8-d15};            /* restore callee-saved d8-d15 */ \
  vfp_ ## fname ## _end:                                                      \
    ldmia         sp!, {fp, pc};        /* recovering from stack and return */
sl@0
   105
sl@0
   106
/* Word-align the code, then stamp out the element-wise binary kernels. */
.align 2

/* d[i] = s1[i] + s2[i] */
UNROLL_F32_TEMPLATE(add_f32, fadd)
UNROLL_F64_TEMPLATE(add_f64, fadd)

/* d[i] = s1[i] / s2[i] */
UNROLL_F32_TEMPLATE(divide_f32, fdiv)
UNROLL_F64_TEMPLATE(divide_f64, fdiv)

/* d[i] = s1[i] * s2[i] */
UNROLL_F32_TEMPLATE(multiply_f32, fmul)
UNROLL_F64_TEMPLATE(multiply_f64, fmul)

/* d[i] = s1[i] - s2[i] */
UNROLL_F32_TEMPLATE(subtract_f32, fsub)
UNROLL_F64_TEMPLATE(subtract_f64, fsub)

/* The template names are reused below with different bodies. */
#undef UNROLL_F32_TEMPLATE
#undef UNROLL_F64_TEMPLATE
sl@0
   121
sl@0
   122
/* 
sl@0
   123
**
sl@0
   124
** void vfp_scalaradd_f32_ns (float *d, const float *s1, const float *s2_1, int n);
sl@0
   125
** void vfp_scalaradd_f64_ns (double *d, const double *s1, const double *s2_1, int n);
sl@0
   126
** void vfp_scalarmultiply_f32_ns (float *d, const float *s1, const float *s2_1, int n);
sl@0
   127
** void vfp_scalarmultiply_f64_ns (double *d, const double *s1, const double *s2_1, int n);
sl@0
   128
**
sl@0
   129
** d:   $r0     |   s1: $r1     | s2_1:  $r2   |   n:  $r3     |
sl@0
   130
**
sl@0
   131
*/
sl@0
   132
/*
 * UNROLL_F32_TEMPLATE(fname, finst) -- vector-by-scalar kernel, float.
 *
 * Emits the global function vfp_<fname>, which combines every element of
 * an input array with one scalar using the VFP instruction <finst>s:
 *
 *     d[i] = s1[i] <finst> *s2_1        for i in [0, n)
 *
 * In:    r0 = d, r1 = s1, r2 = s2_1 (pointer to the scalar), r3 = n
 * Flow:  the scalar is loaded once into s1.  The first n % 8 elements are
 *        processed one at a time; the remaining n / 8 groups are processed
 *        eight at a time as a VFP short vector (FPSCR.LEN field = 7).  s1
 *        lives in register bank 0, which the VFP treats as a scalar
 *        operand, so the same value is used for every lane.
 * Clobb: r0-r1 (advanced past the data), ip, flags, s0-s2, s8-s15.
 *        fp, FPSCR and d12-d15 are saved and restored.  The vector loop
 *        overwrites s24-s31 (d12-d15), which AAPCS requires a callee to
 *        preserve, hence the save/restore around the vector path.
 * Stack: 8 bytes, plus 32 bytes while the vector path runs.
 */
#define UNROLL_F32_TEMPLATE(fname,finst) \
  .global vfp_ ## fname;                                                      \
  vfp_ ## fname:                                                              \
    stmdb         sp!, {fp, lr};            /* save registers to stack */     \
    fldmias       r2, {s1};                 /* load scalar value */           \
    ands          ip, r3, #7;               /* ip = n % 8 */                  \
    beq           vfp_ ## fname ## _unroll; /* no leftovers: vector part */   \
  vfp_ ## fname ## _loop1:                  /* one element per iteration */   \
    fldmias       r1!, {s0};                /* next input element */          \
    finst ## s    s2, s0, s1;                                                 \
    fstmias       r0!, {s2};                /* store result, advance d */     \
    subs          ip, ip, #1;                                                 \
    bne           vfp_ ## fname ## _loop1;                                    \
  vfp_ ## fname ## _unroll:                 /* unroll by 8 */                 \
    movs          ip, r3, lsr #3;           /* ip = n / 8 */                  \
    beq           vfp_ ## fname ## _end;    /* if ip == 0 goto finish */      \
    fstmdbd       sp!, {d12-d15};           /* keep callee-saved s24-s31 */   \
    fmrx          lr, fpscr;                /* read fpscr register into arm */\
    bic           fp, lr, #0x00370000;      /* clear LEN and STRIDE fields */ \
    orr           fp, fp, #0x00070000;      /* set vector length to 8 */      \
    fmxr          fpscr, fp;                                                  \
  vfp_ ## fname ## _loop2:                  /* eight elements per iteration */\
    fldmias       r1!, {s8, s9, s10, s11, s12, s13, s14, s15};                \
    finst ## s    s24, s8, s1;              /* vector op scalar -> s24-s31 */ \
    fstmias       r0!, {s24, s25, s26, s27, s28, s29, s30, s31};              \
    subs          ip, ip, #1;                                                 \
    bne           vfp_ ## fname ## _loop2;                                    \
    fmxr          fpscr, lr;                /* restore original fpscr */      \
    fldmiad       sp!, {d12-d15};           /* restore callee-saved s24-s31 */\
  vfp_ ## fname ## _end:                                                      \
    ldmia         sp!, {fp, pc};        /* recovering from stack and return */
sl@0
   161
sl@0
   162
/*
 * UNROLL_F64_TEMPLATE(fname, finst) -- vector-by-scalar kernel, double.
 *
 * Emits the global function vfp_<fname>, which combines every element of
 * an input array with one scalar using the VFP instruction <finst>d:
 *
 *     d[i] = s1[i] <finst> *s2_1        for i in [0, n)
 *
 * In:    r0 = d, r1 = s1, r2 = s2_1 (pointer to the scalar), r3 = n
 * Flow:  the scalar is loaded once into d1.  The first n % 4 elements are
 *        processed one at a time; the remaining n / 4 groups are processed
 *        four at a time as a VFP short vector (FPSCR.LEN field = 3).  d1
 *        lives in register bank 0, which the VFP treats as a scalar
 *        operand, so the same value is used for every lane.
 * Clobb: r0-r1 (advanced past the data), ip, flags, d0-d2, d4-d7.
 *        fp, FPSCR and d12-d15 are saved and restored.  The vector loop
 *        overwrites d12-d15, which AAPCS requires a callee to preserve,
 *        hence the save/restore around the vector path.
 * Stack: 8 bytes, plus 32 bytes while the vector path runs.
 */
#define UNROLL_F64_TEMPLATE(fname,finst) \
  .global vfp_ ## fname;                                                      \
  vfp_ ## fname:                                                              \
    stmdb         sp!, {fp, lr};            /* save registers to stack */     \
    fldmiad       r2, {d1};                 /* load scalar value */           \
    ands          ip, r3, #3;               /* ip = n % 4 */                  \
    beq           vfp_ ## fname ## _unroll; /* no leftovers: vector part */   \
  vfp_ ## fname ## _loop1:                  /* one element per iteration */   \
    fldmiad       r1!, {d0};                /* next input element */          \
    finst ## d    d2, d0, d1;                                                 \
    fstmiad       r0!, {d2};                /* store result, advance d */     \
    subs          ip, ip, #1;                                                 \
    bne           vfp_ ## fname ## _loop1;                                    \
  vfp_ ## fname ## _unroll:                 /* unroll by 4 */                 \
    movs          ip, r3, lsr #2;           /* ip = n / 4 */                  \
    beq           vfp_ ## fname ## _end;    /* if ip == 0 goto finish */      \
    fstmdbd       sp!, {d12-d15};           /* keep callee-saved d12-d15 */   \
    fmrx          lr, fpscr;                /* read fpscr register into arm */\
    bic           fp, lr, #0x00370000;      /* clear LEN and STRIDE fields */ \
    orr           fp, fp, #0x00030000;      /* set vector length to 4 */      \
    fmxr          fpscr, fp;                                                  \
  vfp_ ## fname ## _loop2:                  /* four elements per iteration */ \
    fldmiad       r1!, {d4, d5, d6, d7};                                      \
    finst ## d    d12, d4, d1;              /* vector op scalar -> d12-d15 */ \
    fstmiad       r0!, {d12, d13, d14, d15};                                  \
    subs          ip, ip, #1;                                                 \
    bne           vfp_ ## fname ## _loop2;                                    \
    fmxr          fpscr, lr;                /* restore original fpscr */      \
    fldmiad       sp!, {d12-d15};           /* restore callee-saved d12-d15 */\
  vfp_ ## fname ## _end:                                                      \
    ldmia         sp!, {fp, pc};        /* recovering from stack and return */
sl@0
   191
sl@0
   192
/* d[i] = s1[i] + *s2_1 */
UNROLL_F32_TEMPLATE(scalaradd_f32_ns, fadd)
UNROLL_F64_TEMPLATE(scalaradd_f64_ns, fadd)

/* d[i] = s1[i] * *s2_1 */
UNROLL_F32_TEMPLATE(scalarmultiply_f32_ns, fmul)
UNROLL_F64_TEMPLATE(scalarmultiply_f64_ns, fmul)

/* The template names are reused below with different bodies. */
#undef UNROLL_F32_TEMPLATE
#undef UNROLL_F64_TEMPLATE
sl@0
   200
sl@0
   201
/* 
sl@0
   202
**
sl@0
   203
** void vfp_abs_f32_f32_ns(float *d, const float *s, int n);
sl@0
   204
** void vfp_abs_f64_f64_ns(double *d, const double *s, int n);
sl@0
   205
** void vfp_negative_f32(float *d, const float *s, int n);
sl@0
   206
** void vfp_negative_f64(double *d, const double *s, int n);
sl@0
   207
**
sl@0
   208
** d:   $r0     |   s: $r1      |   n:  $r2     |
sl@0
   209
**
sl@0
   210
*/
sl@0
   211
/*
 * UNROLL_F32_TEMPLATE(fname, finst) -- element-wise unary kernel, float.
 *
 * Emits the global function vfp_<fname>, which applies the two-operand
 * VFP instruction <finst>s to every element of an input array:
 *
 *     d[i] = <finst>(s[i])              for i in [0, n)
 *
 * In:    r0 = d, r1 = s, r2 = n
 * Flow:  the first n % 8 elements are processed one at a time; the
 *        remaining n / 8 groups are processed eight at a time as a VFP
 *        short vector (FPSCR.LEN field = 7, STRIDE field = 0).
 * Clobb: r0-r1 (advanced past the data), ip, flags, s0, s2, s8-s15.
 *        fp, FPSCR and d12-d15 are saved and restored.  The vector loop
 *        overwrites s24-s31 (d12-d15), which AAPCS requires a callee to
 *        preserve, hence the save/restore around the vector path.
 * Stack: 8 bytes, plus 32 bytes while the vector path runs.
 */
#define UNROLL_F32_TEMPLATE(fname,finst) \
  .global vfp_ ## fname;                                                      \
  vfp_ ## fname:                                                              \
    stmdb         sp!, {fp, lr};            /* save registers to stack */     \
    ands          ip, r2, #7;               /* ip = n % 8 */                  \
    beq           vfp_ ## fname ## _unroll; /* no leftovers: vector part */   \
  vfp_ ## fname ## _loop1:                  /* one element per iteration */   \
    fldmias       r1!, {s0};                /* next input element */          \
    finst ## s    s2, s0;                                                     \
    fstmias       r0!, {s2};                /* store result, advance d */     \
    subs          ip, ip, #1;                                                 \
    bne           vfp_ ## fname ## _loop1;                                    \
  vfp_ ## fname ## _unroll:                 /* unroll by 8 */                 \
    movs          ip, r2, lsr #3;           /* ip = n / 8 */                  \
    beq           vfp_ ## fname ## _end;    /* if ip == 0 goto finish */      \
    fstmdbd       sp!, {d12-d15};           /* keep callee-saved s24-s31 */   \
    fmrx          lr, fpscr;                /* read fpscr register into arm */\
    bic           fp, lr, #0x00370000;      /* clear LEN and STRIDE fields */ \
    orr           fp, fp, #0x00070000;      /* set vector length to 8 */      \
    fmxr          fpscr, fp;                                                  \
  vfp_ ## fname ## _loop2:                  /* eight elements per iteration */\
    fldmias       r1!, {s8, s9, s10, s11, s12, s13, s14, s15};                \
    finst ## s    s24, s8;                  /* vector op on s24-s31 */        \
    fstmias       r0!, {s24, s25, s26, s27, s28, s29, s30, s31};              \
    subs          ip, ip, #1;                                                 \
    bne           vfp_ ## fname ## _loop2;                                    \
    fmxr          fpscr, lr;                /* restore original fpscr */      \
    fldmiad       sp!, {d12-d15};           /* restore callee-saved s24-s31 */\
  vfp_ ## fname ## _end:                                                      \
    ldmia         sp!, {fp, pc};        /* recovering from stack and return */
sl@0
   239
sl@0
   240
/*
 * UNROLL_F64_TEMPLATE(fname, finst) -- element-wise unary kernel, double.
 *
 * Emits the global function vfp_<fname>, which applies the two-operand
 * VFP instruction <finst>d to every element of an input array:
 *
 *     d[i] = <finst>(s[i])              for i in [0, n)
 *
 * In:    r0 = d, r1 = s, r2 = n
 * Flow:  the first n % 4 elements are processed one at a time; the
 *        remaining n / 4 groups are processed four at a time as a VFP
 *        short vector (FPSCR.LEN field = 3, STRIDE field = 0).
 * Clobb: r0-r1 (advanced past the data), ip, flags, d0, d2, d4-d7.
 *        fp, FPSCR and d12-d15 are saved and restored.  The vector loop
 *        overwrites d12-d15, which AAPCS requires a callee to preserve,
 *        hence the save/restore around the vector path.
 * Stack: 8 bytes, plus 32 bytes while the vector path runs.
 */
#define UNROLL_F64_TEMPLATE(fname,finst) \
  .global vfp_ ## fname;                                                      \
  vfp_ ## fname:                                                              \
    stmdb         sp!, {fp, lr};            /* save registers to stack */     \
    ands          ip, r2, #3;               /* ip = n % 4 */                  \
    beq           vfp_ ## fname ## _unroll; /* no leftovers: vector part */   \
  vfp_ ## fname ## _loop1:                  /* one element per iteration */   \
    fldmiad       r1!, {d0};                /* next input element */          \
    finst ## d    d2, d0;                                                     \
    fstmiad       r0!, {d2};                /* store result, advance d */     \
    subs          ip, ip, #1;                                                 \
    bne           vfp_ ## fname ## _loop1;                                    \
  vfp_ ## fname ## _unroll:                 /* unroll by 4 */                 \
    movs          ip, r2, lsr #2;           /* ip = n / 4 */                  \
    beq           vfp_ ## fname ## _end;    /* if ip == 0 goto finish */      \
    fstmdbd       sp!, {d12-d15};           /* keep callee-saved d12-d15 */   \
    fmrx          lr, fpscr;                /* read fpscr register into arm */\
    bic           fp, lr, #0x00370000;      /* clear LEN and STRIDE fields */ \
    orr           fp, fp, #0x00030000;      /* set vector length to 4 */      \
    fmxr          fpscr, fp;                                                  \
  vfp_ ## fname ## _loop2:                  /* four elements per iteration */ \
    fldmiad       r1!, {d4, d5, d6, d7};                                      \
    finst ## d    d12, d4;                  /* vector op on d12-d15 */        \
    fstmiad       r0!, {d12, d13, d14, d15};                                  \
    subs          ip, ip, #1;                                                 \
    bne           vfp_ ## fname ## _loop2;                                    \
    fmxr          fpscr, lr;                /* restore original fpscr */      \
    fldmiad       sp!, {d12-d15};           /* restore callee-saved d12-d15 */\
  vfp_ ## fname ## _end:                                                      \
    ldmia         sp!, {fp, pc};        /* recovering from stack and return */
sl@0
   268
sl@0
   269
/* d[i] = |s[i]| */
UNROLL_F32_TEMPLATE(abs_f32_f32_ns, fabs)
UNROLL_F64_TEMPLATE(abs_f64_f64_ns, fabs)

/* d[i] = -s[i] */
UNROLL_F32_TEMPLATE(negative_f32, fneg)
UNROLL_F64_TEMPLATE(negative_f64, fneg)

/* Drop the template macros once the kernels above have been emitted. */
#undef UNROLL_F32_TEMPLATE
#undef UNROLL_F64_TEMPLATE
sl@0
   277
#endif