Update contrib.
/*
 * Josep Torra <josep@fluendo.com>. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
** compile with -mcpu=arm1136j-s -mfpu=vfp -mfloat-abi=softfp
**
** void vfp_add_f32 (float *d, const float *s1, const float *s2, int n);
** void vfp_add_f64 (double *d, const double *s1, const double *s2, int n);
** void vfp_divide_f32 (float *d, const float *s1, const float *s2, int n);
** void vfp_divide_f64 (double *d, const double *s1, const double *s2, int n);
** void vfp_multiply_f32 (float *d, const float *s1, const float *s2, int n);
** void vfp_multiply_f64 (double *d, const double *s1, const double *s2, int n);
** void vfp_subtract_f32 (float *d, const float *s1, const float *s2, int n);
** void vfp_subtract_f64 (double *d, const double *s1, const double *s2, int n);
**
** d: $r0 | s1: $r1 | s2: $r2 | n: $r3 |
*/
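
/*
** Usage sketch (hypothetical C caller; mix_buffers is illustrative and
** not part of this file). It assumes this file is assembled and linked
** into the same binary and that the declaration matches the prototypes
** above:
**
**   extern void vfp_add_f32 (float *d, const float *s1, const float *s2, int n);
**
**   void mix_buffers (float *out, const float *a, const float *b, int n)
**   {
**     vfp_add_f32 (out, a, b, n);
**   }
**
** After the call, out[i] == a[i] + b[i] for 0 <= i < n.
*/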

#define UNROLL_F32_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr}; /* save registers to stack */ \
    ands ip, r3, #7; /* ip = n % 8 */ \
    beq vfp_ ## fname ## _unroll; /* if ip == 0 goto unroll */ \
  vfp_ ## fname ## _loop1: \
    fldmias r1!, {s0}; \
    fldmias r2!, {s1}; \
    ## finst ##s s2, s0, s1; \
    fstmias r0!, {s2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll: /* unroll by 8 */ \
    movs ip, r3, lsr #3; /* ip = n / 8 */ \
    beq vfp_ ## fname ## _end; /* if ip == 0 goto end */ \
    fmrx lr, fpscr; /* copy fpscr into an ARM register */ \
    mov fp, #7; \
    orr fp, lr, fp, lsl #16; /* set vector length to 8 */ \
    fmxr fpscr, fp; /* enable 8-way vector mode */ \
  vfp_ ## fname ## _loop2: \
    fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}; \
    fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}; \
    ## finst ##s s24, s8, s16; \
    fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr; /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc}; /* restore registers from stack and return */
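
/*
** Note on the fpscr trick used above: the short-vector LEN field occupies
** FPSCR bits [18:16] and holds (vector length - 1), so writing 7 selects
** 8-way vector mode, and each fadds/fmuls/... in loop2 then operates on a
** whole register bank per instruction. A C sketch of the same bit
** manipulation (illustrative only; set_vfp_vector_len is not part of
** this file):
**
**   unsigned int set_vfp_vector_len (unsigned int fpscr, unsigned int len)
**   {
**     return (fpscr & ~(7u << 16)) | ((len - 1u) << 16);
**   }
**
** The assembly only ORs the new LEN bits in (orr fp, lr, fp, lsl #16),
** which is correct under the usual assumption that LEN is 0 (scalar mode)
** on entry; the value saved in lr restores the caller's mode afterwards.
** For n = 13, the split above works out as 13 % 8 = 5 scalar iterations
** followed by 13 / 8 = 1 unrolled iteration.
*/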

#define UNROLL_F64_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr}; /* save registers to stack */ \
    ands ip, r3, #3; /* ip = n % 4 */ \
    beq vfp_ ## fname ## _unroll; /* if ip == 0 goto unroll */ \
  vfp_ ## fname ## _loop1: \
    fldmiad r1!, {d0}; \
    fldmiad r2!, {d1}; \
    ## finst ##d d2, d0, d1; \
    fstmiad r0!, {d2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll: /* unroll by 4 */ \
    movs ip, r3, lsr #2; /* ip = n / 4 */ \
    beq vfp_ ## fname ## _end; /* if ip == 0 goto end */ \
    fmrx lr, fpscr; /* copy fpscr into an ARM register */ \
    mov fp, #3; \
    orr fp, lr, fp, lsl #16; /* set vector length to 4 */ \
    fmxr fpscr, fp; /* enable 4-way vector mode */ \
  vfp_ ## fname ## _loop2: \
    fldmiad r1!, {d4, d5, d6, d7}; \
    fldmiad r2!, {d8, d9, d10, d11}; \
    ## finst ##d d12, d4, d8; \
    fstmiad r0!, {d12, d13, d14, d15}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr; /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc}; /* restore registers from stack and return */
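
/*
** The f64 variant unrolls by 4 rather than 8 because VFP short-vector
** operands are organized in banks of 4 double registers (8 singles), so
** the maximum vector length for doubles is 4; mov fp, #3 encodes that as
** LEN = 3 above.
*/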

UNROLL_F32_TEMPLATE(add_f32,fadd);
UNROLL_F64_TEMPLATE(add_f64,fadd);

UNROLL_F32_TEMPLATE(divide_f32,fdiv);
UNROLL_F64_TEMPLATE(divide_f64,fdiv);

UNROLL_F32_TEMPLATE(multiply_f32,fmul);
UNROLL_F64_TEMPLATE(multiply_f64,fmul);

UNROLL_F32_TEMPLATE(subtract_f32,fsub);
UNROLL_F64_TEMPLATE(subtract_f64,fsub);

#undef UNROLL_F32_TEMPLATE
#undef UNROLL_F64_TEMPLATE
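
/*
** Each instantiation above pastes the operation suffix onto the
** instruction mnemonic: UNROLL_F32_TEMPLATE(add_f32,fadd) is intended to
** expand "## finst ##s" to fadds and emit the global symbol vfp_add_f32;
** the f64 template emits faddd and vfp_add_f64 the same way.
*/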

/*
** void vfp_scalaradd_f32_ns (float *d, const float *s1, const float *s2_1, int n);
** void vfp_scalaradd_f64_ns (double *d, const double *s1, const double *s2_1, int n);
** void vfp_scalarmultiply_f32_ns (float *d, const float *s1, const float *s2_1, int n);
** void vfp_scalarmultiply_f64_ns (double *d, const double *s1, const double *s2_1, int n);
**
** d: $r0 | s1: $r1 | s2_1: $r2 | n: $r3 |
*/
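
/*
** Usage sketch (hypothetical C caller; apply_gain is illustrative and not
** part of this file). The scalar operand is passed by pointer, matching
** the prototypes above:
**
**   extern void vfp_scalarmultiply_f32_ns (float *d, const float *s1,
**                                          const float *s2_1, int n);
**
**   void apply_gain (float *buf, float gain, int n)
**   {
**     vfp_scalarmultiply_f32_ns (buf, buf, &gain, n);
**   }
**
** In-place use (d == s1) works with the loops below because each chunk is
** loaded before it is stored, though that is an observation about this
** code, not a documented guarantee.
*/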

#define UNROLL_F32_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr}; /* save registers to stack */ \
    fldmias r2, {s1}; /* load scalar value */ \
    ands ip, r3, #7; /* ip = n % 8 */ \
    beq vfp_ ## fname ## _unroll; /* if ip == 0 goto unroll */ \
  vfp_ ## fname ## _loop1: \
    fldmias r1!, {s0}; \
    ## finst ##s s2, s0, s1; \
    fstmias r0!, {s2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll: /* unroll by 8 */ \
    movs ip, r3, lsr #3; /* ip = n / 8 */ \
    beq vfp_ ## fname ## _end; /* if ip == 0 goto end */ \
    fmrx lr, fpscr; /* copy fpscr into an ARM register */ \
    mov fp, #7; \
    orr fp, lr, fp, lsl #16; /* set vector length to 8 */ \
    fmxr fpscr, fp; /* enable 8-way vector mode */ \
  vfp_ ## fname ## _loop2: \
    fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}; \
    ## finst ##s s24, s8, s1; /* s1 sits in the scalar bank (s0-s7) */ \
    fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr; /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc}; /* restore registers from stack and return */

#define UNROLL_F64_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr}; /* save registers to stack */ \
    fldmiad r2, {d1}; /* load scalar value */ \
    ands ip, r3, #3; /* ip = n % 4 */ \
    beq vfp_ ## fname ## _unroll; /* if ip == 0 goto unroll */ \
  vfp_ ## fname ## _loop1: \
    fldmiad r1!, {d0}; \
    ## finst ##d d2, d0, d1; \
    fstmiad r0!, {d2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll: /* unroll by 4 */ \
    movs ip, r3, lsr #2; /* ip = n / 4 */ \
    beq vfp_ ## fname ## _end; /* if ip == 0 goto end */ \
    fmrx lr, fpscr; /* copy fpscr into an ARM register */ \
    mov fp, #3; \
    orr fp, lr, fp, lsl #16; /* set vector length to 4 */ \
    fmxr fpscr, fp; /* enable 4-way vector mode */ \
  vfp_ ## fname ## _loop2: \
    fldmiad r1!, {d4, d5, d6, d7}; \
    ## finst ##d d12, d4, d1; /* d1 sits in the scalar bank (d0-d3) */ \
    fstmiad r0!, {d12, d13, d14, d15}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr; /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc}; /* restore registers from stack and return */

UNROLL_F32_TEMPLATE(scalaradd_f32_ns,fadd);
UNROLL_F64_TEMPLATE(scalaradd_f64_ns,fadd);

UNROLL_F32_TEMPLATE(scalarmultiply_f32_ns,fmul);
UNROLL_F64_TEMPLATE(scalarmultiply_f64_ns,fmul);

#undef UNROLL_F32_TEMPLATE
#undef UNROLL_F64_TEMPLATE

/*
** void vfp_abs_f32_f32_ns (float *d, const float *s, int n);
** void vfp_abs_f64_f64_ns (double *d, const double *s, int n);
** void vfp_negative_f32 (float *d, const float *s, int n);
** void vfp_negative_f64 (double *d, const double *s, int n);
**
** d: $r0 | s: $r1 | n: $r2 |
*/
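
/*
** Usage sketch (hypothetical C caller; invert_phase is illustrative and
** not part of this file):
**
**   extern void vfp_negative_f32 (float *d, const float *s, int n);
**
**   void invert_phase (float *out, const float *in, int n)
**   {
**     vfp_negative_f32 (out, in, n);
**   }
**
** After the call, out[i] == -in[i] for 0 <= i < n.
*/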

#define UNROLL_F32_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr}; /* save registers to stack */ \
    ands ip, r2, #7; /* ip = n % 8 */ \
    beq vfp_ ## fname ## _unroll; /* if ip == 0 goto unroll */ \
  vfp_ ## fname ## _loop1: \
    fldmias r1!, {s0}; \
    ## finst ##s s2, s0; \
    fstmias r0!, {s2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll: /* unroll by 8 */ \
    movs ip, r2, lsr #3; /* ip = n / 8 */ \
    beq vfp_ ## fname ## _end; /* if ip == 0 goto end */ \
    fmrx lr, fpscr; /* copy fpscr into an ARM register */ \
    mov fp, #7; \
    orr fp, lr, fp, lsl #16; /* set vector length to 8 */ \
    fmxr fpscr, fp; /* enable 8-way vector mode */ \
  vfp_ ## fname ## _loop2: \
    fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}; \
    ## finst ##s s24, s8; \
    fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr; /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc}; /* restore registers from stack and return */

#define UNROLL_F64_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr}; /* save registers to stack */ \
    ands ip, r2, #3; /* ip = n % 4 */ \
    beq vfp_ ## fname ## _unroll; /* if ip == 0 goto unroll */ \
  vfp_ ## fname ## _loop1: \
    fldmiad r1!, {d0}; \
    ## finst ##d d2, d0; \
    fstmiad r0!, {d2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll: /* unroll by 4 */ \
    movs ip, r2, lsr #2; /* ip = n / 4 */ \
    beq vfp_ ## fname ## _end; /* if ip == 0 goto end */ \
    fmrx lr, fpscr; /* copy fpscr into an ARM register */ \
    mov fp, #3; \
    orr fp, lr, fp, lsl #16; /* set vector length to 4 */ \
    fmxr fpscr, fp; /* enable 4-way vector mode */ \
  vfp_ ## fname ## _loop2: \
    fldmiad r1!, {d4, d5, d6, d7}; \
    ## finst ##d d12, d4; \
    fstmiad r0!, {d12, d13, d14, d15}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr; /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc}; /* restore registers from stack and return */

UNROLL_F32_TEMPLATE(abs_f32_f32_ns,fabs);
UNROLL_F64_TEMPLATE(abs_f64_f64_ns,fabs);

UNROLL_F32_TEMPLATE(negative_f32,fneg);
UNROLL_F64_TEMPLATE(negative_f64,fneg);

#undef UNROLL_F32_TEMPLATE
#undef UNROLL_F64_TEMPLATE