os/ossrv/genericopenlibs/liboil/src/arm/math_vfp_asm.s
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2  * Copyright (c) 2007
     3  *	Josep Torra <josep@fluendo.com>.  All rights reserved.
     4  *
     5  * Redistribution and use in source and binary forms, with or without
     6  * modification, are permitted provided that the following conditions
     7  * are met:
     8  * 1. Redistributions of source code must retain the above copyright
     9  *    notice, this list of conditions and the following disclaimer.
    10  * 2. Redistributions in binary form must reproduce the above copyright
    11  *    notice, this list of conditions and the following disclaimer in the
    12  *    documentation and/or other materials provided with the distribution.
    13  *
    14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
    15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
    18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
    20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
    21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
    23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
    24  * SUCH DAMAGE.
    25  */
    26 
    27 #if __VFP_FP__
    28 /* 
    29 ** compile with -mcpu=arm1136j-s -mfpu=vfp -mfloat-abi=softfp
    30 **
    31 ** void vfp_add_f32 (float *d, const float *s1, const float *s2, int n);
    32 ** void vfp_add_f64 (double *d, const double *s1, const double *s2, int n);
    33 ** void vfp_divide_f32 (float *d, const float *s1, const float *s2, int n);
    34 ** void vfp_divide_f64 (double *d, const double *s1, const double *s2, int n);
    35 ** void vfp_multiply_f32 (float *d, const float *s1, const float *s2, int n);
    36 ** void vfp_multiply_f64 (double *d, const double *s1, const double *s2, int n);
    37 ** void vfp_subtract_f32 (float *d, const float *s1, const float *s2, int n);
    38 ** void vfp_subtract_f64 (double *d, const double *s1, const double *s2, int n);
    39 **
    40 ** d:   $r0     |   s1: $r1     | s2:  $r2     |   n:  $r3     |
    41 **
    42 */
    43 
/*
** UNROLL_F32_TEMPLATE(fname, finst): emits the function vfp_<fname> with
**   for (i = 0; i < n; i++) d[i] = s1[i] <finst> s2[i]    (single precision)
** The n % 8 leftover elements are handled one at a time in _loop1; the rest
** are processed 8 at a time in _loop2 using VFP short-vector mode
** (FPSCR LEN = 8), so a single <finst>s computes s8-s15 op s16-s23 -> s24-s31.
** NOTE(review): the LEN value is OR-ed into the saved FPSCR rather than the
** field being cleared first; this relies on the caller's FPSCR having
** LEN == 0 (scalar mode), as the ABI requires at public call boundaries.
*/
     44 #define UNROLL_F32_TEMPLATE(fname,finst) \
     45   .global vfp_ ## fname ## ;                                                  \
     46   vfp_ ## fname ## :                                                          \
     47     stmdb         sp!, {fp, lr};            /* save registers to stack */     \
     48     ands          ip, r3, #7;               /* ip = n % 8 */                  \
     49     beq           vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */  \
     50   vfp_ ## fname ## _loop1:                                                    \
     51     fldmias       r1!, {s0};                                                  \
     52     fldmias       r2!, {s1};                                                  \
     53     ## finst ##s  s2, s0, s1;                                                 \
     54     fstmias       r0!, {s2};                                                  \
     55     subs          ip, ip, #1;                                                 \
     56     bne           vfp_ ## fname ## _loop1;                                    \
     57   vfp_ ## fname ## _unroll:                 /* unroll by 8 */                 \
     58     movs          ip, r3, lsr #3;           /* ip = n / 8 */                  \
     59     beq           vfp_ ## fname ## _end;    /* if ip == 0 goto finish */      \
     60     fmrx          lr, fpscr;                /* read fpscr register into arm */\
     61     mov           fp, #7;                                                     \
     62     orr           fp, lr, fp, lsl #16;      /* set vector length to 8 */      \
     63     fmxr          fpscr, fp;                                                  \
     64   vfp_ ## fname ## _loop2:                                                    \
     65     fldmias       r1!, {s8, s9, s10, s11, s12, s13, s14, s15};                \
     66     fldmias       r2!, {s16, s17, s18, s19, s20, s21, s22, s23};              \
     67     ## finst ##s  s24, s8, s16;                                               \
     68     fstmias       r0!, {s24, s25, s26, s27, s28, s29, s30, s31};              \
     69     subs          ip, ip, #1;                                                 \
     70     bne           vfp_ ## fname ## _loop2;                                    \
     71     fmxr          fpscr, lr;                /* restore original fpscr */      \
     72   vfp_ ## fname ## _end:                                                      \
     73     ldmia         sp!, {fp, pc};        /* recovering from stack and return */   
    74 
/*
** UNROLL_F64_TEMPLATE(fname, finst): emits the function vfp_<fname> with
**   for (i = 0; i < n; i++) d[i] = s1[i] <finst> s2[i]    (double precision)
** The n % 4 leftover elements are handled one at a time in _loop1; the rest
** are processed 4 at a time in _loop2 using VFP short-vector mode
** (FPSCR LEN = 4), so a single <finst>d computes d4-d7 op d8-d11 -> d12-d15.
** NOTE(review): the LEN value is OR-ed into the saved FPSCR rather than the
** field being cleared first; this relies on the caller's FPSCR having
** LEN == 0 (scalar mode), as the ABI requires at public call boundaries.
*/
     75 #define UNROLL_F64_TEMPLATE(fname,finst) \
     76   .global vfp_ ## fname ## ;                                                  \
     77   vfp_ ## fname ## :                                                          \
     78     stmdb         sp!, {fp, lr};            /* save registers to stack */     \
     79     ands          ip, r3, #3;               /* ip = n % 4 */                  \
     80     beq           vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */  \
     81   vfp_ ## fname ## _loop1:                                                    \
     82     fldmiad       r1!, {d0};                                                  \
     83     fldmiad       r2!, {d1};                                                  \
     84     ## finst ##d  d2, d0, d1;                                                 \
     85     fstmiad       r0!, {d2};                                                  \
     86     subs          ip, ip, #1;                                                 \
     87     bne           vfp_ ## fname ## _loop1;                                    \
     88   vfp_ ## fname ## _unroll:                 /* unroll by 4 */                 \
     89     movs          ip, r3, lsr #2;           /* ip = n / 4 */                  \
     90     beq           vfp_ ## fname ## _end;    /* if ip == 0 goto finish */      \
     91     fmrx          lr, fpscr;                /* read fpscr register into arm */\
     92     mov           fp, #3;                                                     \
     93     orr           fp, lr, fp, lsl #16;      /* set vector length to 4 */      \
     94     fmxr          fpscr, fp;                                                  \
     95   vfp_ ## fname ## _loop2:                                                    \
     96     fldmiad       r1!, {d4, d5, d6, d7};                                      \
     97     fldmiad       r2!, {d8, d9, d10, d11};                                    \
     98     ## finst ##d  d12, d4, d8;                                                \
     99     fstmiad       r0!, {d12, d13, d14, d15};                                  \
    100     subs          ip, ip, #1;                                                 \
    101     bne           vfp_ ## fname ## _loop2;                                    \
    102     fmxr          fpscr, lr;                /* restore original fpscr */      \
    103   vfp_ ## fname ## _end:                                                      \
    104     ldmia         sp!, {fp, pc};        /* recovering from stack and return */   
   105 
/* Instantiate the element-wise binary kernels: d[i] = s1[i] OP s2[i]
** for add/divide/multiply/subtract in both f32 and f64 flavours. */
    106 .align 2
    107 UNROLL_F32_TEMPLATE(add_f32,fadd);
    108 UNROLL_F64_TEMPLATE(add_f64,fadd);
    109 
    110 UNROLL_F32_TEMPLATE(divide_f32,fdiv);
    111 UNROLL_F64_TEMPLATE(divide_f64,fdiv);
    112 
    113 UNROLL_F32_TEMPLATE(multiply_f32,fmul);
    114 UNROLL_F64_TEMPLATE(multiply_f64,fmul);
    115 
    116 UNROLL_F32_TEMPLATE(subtract_f32,fsub);
    117 UNROLL_F64_TEMPLATE(subtract_f64,fsub);
    118 
    119 #undef UNROLL_F32_TEMPLATE
    120 #undef UNROLL_F64_TEMPLATE
   121 
   122 /* 
   123 **
   124 ** void vfp_scalaradd_f32_ns (float *d, const float *s1, const float *s2_1, int n);
   125 ** void vfp_scalaradd_f64_ns (double *d, const double *s1, const double *s2_1, int n);
   126 ** void vfp_scalarmultiply_f32_ns (float *d, const float *s1, const float *s2_1, int n);
   127 ** void vfp_scalarmultiply_f64_ns (double *d, const double *s1, const double *s2_1, int n);
   128 **
   129 ** d:   $r0     |   s1: $r1     | s2_1:  $r2   |   n:  $r3     |
   130 **
   131 */
/*
** UNROLL_F32_TEMPLATE(fname, finst): emits the function vfp_<fname> with
**   for (i = 0; i < n; i++) d[i] = s1[i] <finst> *s2_1   (single precision)
** The scalar operand is loaded once into s1.  Because s1 lies in the VFP
** scalar bank (s0-s7), it is applied unchanged to every lane when _loop2
** runs in short-vector mode (FPSCR LEN = 8): s8-s15 op s1 -> s24-s31.
** NOTE(review): the LEN value is OR-ed into the saved FPSCR rather than the
** field being cleared first; relies on caller's FPSCR LEN == 0 per the ABI.
*/
    132 #define UNROLL_F32_TEMPLATE(fname,finst) \
    133   .global vfp_ ## fname ## ;                                                  \
    134   vfp_ ## fname ## :                                                          \
    135     stmdb         sp!, {fp, lr};            /* save registers to stack */     \
    136     fldmias       r2, {s1};                 /* load scalar value */           \
    137     ands          ip, r3, #7;               /* ip = n % 8 */                  \
    138     beq           vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */  \
    139   vfp_ ## fname ## _loop1:                                                    \
    140     fldmias       r1!, {s0};                                                  \
    141     ## finst ##s  s2, s0, s1;                                                 \
    142     fstmias       r0!, {s2};                                                  \
    143     subs          ip, ip, #1;                                                 \
    144     bne           vfp_ ## fname ## _loop1;                                    \
    145   vfp_ ## fname ## _unroll:                 /* unroll by 8 */                 \
    146     movs          ip, r3, lsr #3;           /* ip = n / 8 */                  \
    147     beq           vfp_ ## fname ## _end;    /* if ip == 0 goto finish */      \
    148     fmrx          lr, fpscr;                /* read fpscr register into arm */\
    149     mov           fp, #7;                                                     \
    150     orr           fp, lr, fp, lsl #16;      /* set vector length to 8 */      \
    151     fmxr          fpscr, fp;                                                  \
    152   vfp_ ## fname ## _loop2:                                                    \
    153     fldmias       r1!, {s8, s9, s10, s11, s12, s13, s14, s15};                \
    154     ## finst ##s  s24, s8, s1;                                                \
    155     fstmias       r0!, {s24, s25, s26, s27, s28, s29, s30, s31};              \
    156     subs          ip, ip, #1;                                                 \
    157     bne           vfp_ ## fname ## _loop2;                                    \
    158     fmxr          fpscr, lr;                /* restore original fpscr */      \
    159   vfp_ ## fname ## _end:                                                      \
    160     ldmia         sp!, {fp, pc};        /* recovering from stack and return */   
   161 
/*
** UNROLL_F64_TEMPLATE(fname, finst): emits the function vfp_<fname> with
**   for (i = 0; i < n; i++) d[i] = s1[i] <finst> *s2_1   (double precision)
** The scalar operand is loaded once into d1.  Because d1 lies in the VFP
** scalar bank (d0-d3), it is applied unchanged to every lane when _loop2
** runs in short-vector mode (FPSCR LEN = 4): d4-d7 op d1 -> d12-d15.
** NOTE(review): the LEN value is OR-ed into the saved FPSCR rather than the
** field being cleared first; relies on caller's FPSCR LEN == 0 per the ABI.
*/
    162 #define UNROLL_F64_TEMPLATE(fname,finst) \
    163   .global vfp_ ## fname ## ;                                                  \
    164   vfp_ ## fname ## :                                                          \
    165     stmdb         sp!, {fp, lr};            /* save registers to stack */     \
    166     fldmiad       r2, {d1};                 /* load scalar value */           \
    167     ands          ip, r3, #3;               /* ip = n % 4 */                  \
    168     beq           vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */  \
    169   vfp_ ## fname ## _loop1:                                                    \
    170     fldmiad       r1!, {d0};                                                  \
    171     ## finst ##d  d2, d0, d1;                                                 \
    172     fstmiad       r0!, {d2};                                                  \
    173     subs          ip, ip, #1;                                                 \
    174     bne           vfp_ ## fname ## _loop1;                                    \
    175   vfp_ ## fname ## _unroll:                 /* unroll by 4 */                 \
    176     movs          ip, r3, lsr #2;           /* ip = n / 4 */                  \
    177     beq           vfp_ ## fname ## _end;    /* if ip == 0 goto finish */      \
    178     fmrx          lr, fpscr;                /* read fpscr register into arm */\
    179     mov           fp, #3;                                                     \
    180     orr           fp, lr, fp, lsl #16;      /* set vector length to 4 */      \
    181     fmxr          fpscr, fp;                                                  \
    182   vfp_ ## fname ## _loop2:                                                    \
    183     fldmiad       r1!, {d4, d5, d6, d7};                                      \
    184     ## finst ##d  d12, d4, d1;                                                \
    185     fstmiad       r0!, {d12, d13, d14, d15};                                  \
    186     subs          ip, ip, #1;                                                 \
    187     bne           vfp_ ## fname ## _loop2;                                    \
    188     fmxr          fpscr, lr;                /* restore original fpscr */      \
    189   vfp_ ## fname ## _end:                                                      \
    190     ldmia         sp!, {fp, pc};        /* recovering from stack and return */   
   191 
/* Instantiate the scalar-operand kernels: d[i] = s1[i] OP *s2_1
** for add and multiply in both f32 and f64 flavours. */
    192 UNROLL_F32_TEMPLATE(scalaradd_f32_ns,fadd);
    193 UNROLL_F64_TEMPLATE(scalaradd_f64_ns,fadd);
    194 
    195 UNROLL_F32_TEMPLATE(scalarmultiply_f32_ns,fmul);
    196 UNROLL_F64_TEMPLATE(scalarmultiply_f64_ns,fmul);
    197 
    198 #undef UNROLL_F32_TEMPLATE
    199 #undef UNROLL_F64_TEMPLATE
   200 
   201 /* 
   202 **
   203 ** void vfp_abs_f32_f32_ns(float *d, const float *s, int n);
   204 ** void vfp_abs_f64_f64_ns(double *d, const double *s, int n);
   205 ** void vfp_negative_f32(float *d, const float *s, int n);
   206 ** void vfp_negative_f64(double *d, const double *s, int n);
   207 **
   208 ** d:   $r0     |   s: $r1      |   n:  $r2     |
   209 **
   210 */
/*
** UNROLL_F32_TEMPLATE(fname, finst): emits the function vfp_<fname> with
**   for (i = 0; i < n; i++) d[i] = <finst>(s[i])    (single precision)
** Unary variant (note n arrives in r2, not r3).  The n % 8 leftovers are
** handled one at a time in _loop1; the rest 8 at a time in _loop2 using VFP
** short-vector mode (FPSCR LEN = 8): <finst>s on s8-s15 -> s24-s31.
** NOTE(review): the LEN value is OR-ed into the saved FPSCR rather than the
** field being cleared first; relies on caller's FPSCR LEN == 0 per the ABI.
*/
    211 #define UNROLL_F32_TEMPLATE(fname,finst) \
    212   .global vfp_ ## fname ## ;                                                  \
    213   vfp_ ## fname ## :                                                          \
    214     stmdb         sp!, {fp, lr};            /* save registers to stack */     \
    215     ands          ip, r2, #7;               /* ip = n % 8 */                  \
    216     beq           vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */  \
    217   vfp_ ## fname ## _loop1:                                                    \
    218     fldmias       r1!, {s0};                                                  \
    219     ## finst ##s  s2, s0;                                                     \
    220     fstmias       r0!, {s2};                                                  \
    221     subs          ip, ip, #1;                                                 \
    222     bne           vfp_ ## fname ## _loop1;                                    \
    223   vfp_ ## fname ## _unroll:                 /* unroll by 8 */                 \
    224     movs          ip, r2, lsr #3;           /* ip = n / 8 */                  \
    225     beq           vfp_ ## fname ## _end;    /* if ip == 0 goto finish */      \
    226     fmrx          lr, fpscr;                /* read fpscr register into arm */\
    227     mov           fp, #7;                                                     \
    228     orr           fp, lr, fp, lsl #16;      /* set vector length to 8 */      \
    229     fmxr          fpscr, fp;                                                  \
    230   vfp_ ## fname ## _loop2:                                                    \
    231     fldmias       r1!, {s8, s9, s10, s11, s12, s13, s14, s15};                \
    232     ## finst ##s  s24, s8;                                                    \
    233     fstmias       r0!, {s24, s25, s26, s27, s28, s29, s30, s31};              \
    234     subs          ip, ip, #1;                                                 \
    235     bne           vfp_ ## fname ## _loop2;                                    \
    236     fmxr          fpscr, lr;                /* restore original fpscr */      \
    237   vfp_ ## fname ## _end:                                                      \
    238     ldmia         sp!, {fp, pc};        /* recovering from stack and return */   
   239 
/*
** UNROLL_F64_TEMPLATE(fname, finst): emits the function vfp_<fname> with
**   for (i = 0; i < n; i++) d[i] = <finst>(s[i])    (double precision)
** Unary variant (note n arrives in r2, not r3).  The n % 4 leftovers are
** handled one at a time in _loop1; the rest 4 at a time in _loop2 using VFP
** short-vector mode (FPSCR LEN = 4): <finst>d on d4-d7 -> d12-d15.
** NOTE(review): the LEN value is OR-ed into the saved FPSCR rather than the
** field being cleared first; relies on caller's FPSCR LEN == 0 per the ABI.
*/
    240 #define UNROLL_F64_TEMPLATE(fname,finst) \
    241   .global vfp_ ## fname ## ;                                                  \
    242   vfp_ ## fname ## :                                                          \
    243     stmdb         sp!, {fp, lr};            /* save registers to stack */     \
    244     ands          ip, r2, #3;               /* ip = n % 4 */                  \
    245     beq           vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */  \
    246   vfp_ ## fname ## _loop1:                                                    \
    247     fldmiad       r1!, {d0};                                                  \
    248     ## finst ##d  d2, d0;                                                     \
    249     fstmiad       r0!, {d2};                                                  \
    250     subs          ip, ip, #1;                                                 \
    251     bne           vfp_ ## fname ## _loop1;                                    \
    252   vfp_ ## fname ## _unroll:                 /* unroll by 4 */                 \
    253     movs          ip, r2, lsr #2;           /* ip = n / 4 */                  \
    254     beq           vfp_ ## fname ## _end;    /* if ip == 0 goto finish */      \
    255     fmrx          lr, fpscr;                /* read fpscr register into arm */\
    256     mov           fp, #3;                                                     \
    257     orr           fp, lr, fp, lsl #16;      /* set vector length to 4 */      \
    258     fmxr          fpscr, fp;                                                  \
    259   vfp_ ## fname ## _loop2:                                                    \
    260     fldmiad       r1!, {d4, d5, d6, d7};                                      \
    261     ## finst ##d  d12, d4;                                                    \
    262     fstmiad       r0!, {d12, d13, d14, d15};                                  \
    263     subs          ip, ip, #1;                                                 \
    264     bne           vfp_ ## fname ## _loop2;                                    \
    265     fmxr          fpscr, lr;                /* restore original fpscr */      \
    266   vfp_ ## fname ## _end:                                                      \
    267     ldmia         sp!, {fp, pc};        /* recovering from stack and return */   
   268 
/* Instantiate the unary kernels: d[i] = OP(s[i])
** for abs and negate in both f32 and f64 flavours. */
    269 UNROLL_F32_TEMPLATE(abs_f32_f32_ns,fabs);
    270 UNROLL_F64_TEMPLATE(abs_f64_f64_ns,fabs);
    271 
    272 UNROLL_F32_TEMPLATE(negative_f32,fneg);
    273 UNROLL_F64_TEMPLATE(negative_f64,fneg);
    274 
    275 #undef UNROLL_F32_TEMPLATE
    276 #undef UNROLL_F64_TEMPLATE
   277 #endif