os/ossrv/genericopenlibs/liboil/src/math_sse.c
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2  * Copyright (c) 2005
     3  *	Eric Anholt.  All rights reserved.
     4  *
     5  * Redistribution and use in source and binary forms, with or without
     6  * modification, are permitted provided that the following conditions
     7  * are met:
     8  * 1. Redistributions of source code must retain the above copyright
     9  *    notice, this list of conditions and the following disclaimer.
    10  * 2. Redistributions in binary form must reproduce the above copyright
    11  *    notice, this list of conditions and the following disclaimer in the
    12  *    documentation and/or other materials provided with the distribution.
    13  *
    14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
    15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
    18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
    20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
    21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
    23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
    24  * SUCH DAMAGE.
    25  */
    26 //Portions Copyright (c)  2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. 
    27 
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <stdint.h>

#include <liboilclasses.h>
#include <liboilfunction.h>

#include <emmintrin.h>
#include <xmmintrin.h>
    35 
    36 #define SSE_FUNCTION __attribute__((force_align_arg_pointer))
    37 
    38 SSE_FUNCTION static void
    39 add_f32_sse (float *dest, float *src1, float *src2, int n)
    40 {
    41   /* Initial operations to align the destination pointer */
    42   for (; ((long)dest & 15) && (n > 0); n--) {
    43     *dest++ = *src1++ + *src2++;
    44   }
    45   for (; n >= 4; n -= 4) {
    46     __m128 xmm0, xmm1;
    47     xmm0 = _mm_loadu_ps(src1);
    48     xmm1 = _mm_loadu_ps(src2);
    49     xmm0 = _mm_add_ps(xmm0, xmm1);
    50     _mm_store_ps(dest, xmm0);
    51     dest += 4;
    52     src1 += 4;
    53     src2 += 4;
    54   }
    55   for (; n > 0; n--) {
    56     *dest++ = *src1++ + *src2++;
    57   }
    58 }
    59 OIL_DEFINE_IMPL_FULL (add_f32_sse, add_f32, OIL_IMPL_FLAG_SSE);
    60 
    61 SSE_FUNCTION static void
    62 add_f64_sse2 (double *dest, double *src1, double *src2, int n)
    63 {
    64   __m128d xmm0, xmm1;
    65   while (((long)dest & 15) && (0 < n)) {
    66     *dest++ = *src1++ + *src2++;
    67     n--;
    68   }
    69   while (1 < n) {
    70     xmm0 = _mm_loadu_pd(src1);
    71     xmm1 = _mm_loadu_pd(src2);
    72     xmm0 = _mm_add_pd(xmm0, xmm1);
    73     _mm_store_pd(dest, xmm0);
    74     dest += 2;
    75     src1 += 2;
    76     src2 += 2;
    77     n -= 2;
    78   }
    79   while (0 < n) {
    80     *dest++ = *src1++ + *src2++;
    81     n--;
    82   }
    83 }
    84 OIL_DEFINE_IMPL_FULL (add_f64_sse2, add_f64, OIL_IMPL_FLAG_SSE2);
    85 
    86 SSE_FUNCTION static void
    87 add_f64_sse2_unroll (double *dest, double *src1, double *src2, int n)
    88 {
    89   __m128d xmm0, xmm1;
    90   while (((long)dest & 15) && (0 < n)) {
    91     *dest++ = *src1++ + *src2++;
    92     n--;
    93   }
    94   while (3 < n) {
    95     xmm0 = _mm_loadu_pd(src1);
    96     xmm1 = _mm_loadu_pd(src2);
    97     xmm0 = _mm_add_pd(xmm0, xmm1);
    98     _mm_store_pd(dest, xmm0);
    99 
   100     xmm0 = _mm_loadu_pd(src1+2);
   101     xmm1 = _mm_loadu_pd(src2+2);
   102     xmm0 = _mm_add_pd(xmm0, xmm1);
   103     _mm_store_pd(dest+2, xmm0);
   104     dest += 4;
   105     src1 += 4;
   106     src2 += 4;
   107     n -= 4;
   108   }
   109   while (1 < n) {
   110     xmm0 = _mm_loadu_pd(src1);
   111     xmm1 = _mm_loadu_pd(src2);
   112     xmm0 = _mm_add_pd(xmm0, xmm1);
   113     _mm_store_pd(dest, xmm0);
   114     dest += 2;
   115     src1 += 2;
   116     src2 += 2;
   117     n -= 2;
   118   }
   119   while (0 < n) {
   120     *dest++ = *src1++ + *src2++;
   121     n--;
   122   }
   123 }
   124 OIL_DEFINE_IMPL_FULL (add_f64_sse2_unroll, add_f64, OIL_IMPL_FLAG_SSE2);
   125 
   126 SSE_FUNCTION static void
   127 subtract_f32_sse (float *dest, float *src1, float *src2, int n)
   128 {
   129   /* Initial operations to align the destination pointer */
   130   for (; ((long)dest & 15) && (n > 0); n--) {
   131     *dest++ = *src1++ - *src2++;
   132   }
   133   for (; n >= 4; n -= 4) {
   134     __m128 xmm0, xmm1;
   135     xmm0 = _mm_loadu_ps(src1);
   136     xmm1 = _mm_loadu_ps(src2);
   137     xmm0 = _mm_sub_ps(xmm0, xmm1);
   138     _mm_store_ps(dest, xmm0);
   139     dest += 4;
   140     src1 += 4;
   141     src2 += 4;
   142   }
   143   for (; n > 0; n--) {
   144     *dest++ = *src1++ - *src2++;
   145   }
   146 }
   147 OIL_DEFINE_IMPL_FULL (subtract_f32_sse, subtract_f32, OIL_IMPL_FLAG_SSE);
   148 
   149 SSE_FUNCTION static void
   150 multiply_f32_sse (float *dest, float *src1, float *src2, int n)
   151 {
   152   /* Initial operations to align the destination pointer */
   153   for (; ((long)dest & 15) && (n > 0); n--) {
   154     *dest++ = *src1++ * *src2++;
   155   }
   156   for (; n >= 4; n -= 4) {
   157     __m128 xmm0, xmm1;
   158     xmm0 = _mm_loadu_ps(src1);
   159     xmm1 = _mm_loadu_ps(src2);
   160     xmm0 = _mm_mul_ps(xmm0, xmm1);
   161     _mm_store_ps(dest, xmm0);
   162     dest += 4;
   163     src1 += 4;
   164     src2 += 4;
   165   }
   166   for (; n > 0; n--) {
   167     *dest++ = *src1++ * *src2++;
   168   }
   169 }
   170 OIL_DEFINE_IMPL_FULL (multiply_f32_sse, multiply_f32, OIL_IMPL_FLAG_SSE);
   171 
   172 SSE_FUNCTION static void
   173 divide_f32_sse (float *dest, float *src1, float *src2, int n)
   174 {
   175   /* Initial operations to align the destination pointer */
   176   for (; ((long)dest & 15) && (n > 0); n--) {
   177     *dest++ = *src1++ / *src2++;
   178   }
   179   for (; n >= 4; n -= 4) {
   180     __m128 xmm0, xmm1;
   181     xmm0 = _mm_loadu_ps(src1);
   182     xmm1 = _mm_loadu_ps(src2);
   183     xmm0 = _mm_div_ps(xmm0, xmm1);
   184     _mm_store_ps(dest, xmm0);
   185     dest += 4;
   186     src1 += 4;
   187     src2 += 4;
   188   }
   189   for (; n > 0; n--) {
   190     *dest++ = *src1++ / *src2++;
   191   }
   192 }
   193 OIL_DEFINE_IMPL_FULL (divide_f32_sse, divide_f32, OIL_IMPL_FLAG_SSE);
   194 
   195 SSE_FUNCTION static void
   196 minimum_f32_sse (float *dest, float *src1, float *src2, int n)
   197 {
   198   /* Initial operations to align the destination pointer */
   199   for (; ((long)dest & 15) && (n > 0); n--) {
   200     *dest++ = *src1 < *src2 ? *src1 : *src2;
   201     src1++;
   202     src2++;
   203   }
   204   for (; n >= 4; n -= 4) {
   205     __m128 xmm0, xmm1;
   206     xmm0 = _mm_loadu_ps(src1);
   207     xmm1 = _mm_loadu_ps(src2);
   208     xmm0 = _mm_min_ps(xmm0, xmm1);
   209     _mm_store_ps(dest, xmm0);
   210     dest += 4;
   211     src1 += 4;
   212     src2 += 4;
   213   }
   214   for (; n > 0; n--) {
   215     *dest++ = *src1 < *src2 ? *src1 : *src2;
   216     src1++;
   217     src2++;
   218   }
   219 }
   220 OIL_DEFINE_IMPL_FULL (minimum_f32_sse, minimum_f32, OIL_IMPL_FLAG_SSE);
   221 
   222 SSE_FUNCTION static void
   223 maximum_f32_sse (float *dest, float *src1, float *src2, int n)
   224 {
   225   /* Initial operations to align the destination pointer */
   226   for (; ((long)dest & 15) && (n > 0); n--) {
   227     *dest++ = *src1 > *src2 ? *src1 : *src2;
   228     src1++;
   229     src2++;
   230   }
   231   for (; n >= 4; n -= 4) {
   232     __m128 xmm0, xmm1;
   233     xmm0 = _mm_loadu_ps(src1);
   234     xmm1 = _mm_loadu_ps(src2);
   235     xmm0 = _mm_max_ps(xmm0, xmm1);
   236     _mm_store_ps(dest, xmm0);
   237     dest += 4;
   238     src1 += 4;
   239     src2 += 4;
   240   }
   241   for (; n > 0; n--) {
   242     *dest++ = *src1 > *src2 ? *src1 : *src2;
   243     src1++;
   244     src2++;
   245   }
   246 }
   247 OIL_DEFINE_IMPL_FULL (maximum_f32_sse, maximum_f32, OIL_IMPL_FLAG_SSE);
   248 
   249 SSE_FUNCTION static void
   250 inverse_f32_sse (float *dest, float *src1, int n)
   251 {
   252   /* Initial operations to align the destination pointer */
   253   for (; ((long)dest & 15) && (n > 0); n--) {
   254     *dest++ = 1.0 / *src1++;
   255   }
   256   for (; n >= 4; n -= 4) {
   257     __m128 xmm0, xmm1;
   258     /* While _mm_rcp_ps sounds promising, the results it gives are rather
   259      * different from the 1.0 / src1 reference implementation, so do that.
   260      */
   261     xmm0 = _mm_set_ps1(1.0);
   262     xmm1 = _mm_loadu_ps(src1);
   263     xmm0 = _mm_div_ps(xmm0, xmm1);
   264     _mm_store_ps(dest, xmm0);
   265     dest += 4;
   266     src1 += 4;
   267   }
   268   for (; n > 0; n--) {
   269     *dest++ = 1.0 / *src1++;
   270   }
   271 }
   272 OIL_DEFINE_IMPL_FULL (inverse_f32_sse, inverse_f32, OIL_IMPL_FLAG_SSE);
   273 
   274 SSE_FUNCTION static void
   275 negative_f32_sse (float *dest, float *src1, int n)
   276 {
   277   /* Initial operations to align the destination pointer */
   278   for (; ((long)dest & 15) && (n > 0); n--) {
   279     *dest++ = -(*src1++);
   280   }
   281   for (; n >= 4; n -= 4) {
   282     __m128 xmm0, xmm1;
   283     xmm0 = _mm_setzero_ps();
   284     xmm1 = _mm_loadu_ps(src1);
   285     xmm0 = _mm_sub_ps(xmm0, xmm1);
   286     _mm_store_ps(dest, xmm0);
   287     dest += 4;
   288     src1 += 4;
   289   }
   290   for (; n > 0; n--) {
   291     *dest++ = -(*src1++);
   292   }
   293 }
   294 OIL_DEFINE_IMPL_FULL (negative_f32_sse, negative_f32, OIL_IMPL_FLAG_SSE);
   295 
   296 SSE_FUNCTION static void
   297 scalaradd_f32_ns_sse (float *dest, float *src1, float *val, int n)
   298 {
   299   __m128 xmm1;
   300 
   301   /* Initial operations to align the destination pointer */
   302   for (; ((long)dest & 15) && (n > 0); n--) {
   303     *dest++ = *src1++ + *val;
   304   }
   305   xmm1 = _mm_load_ps1(val);
   306   for (; n >= 4; n -= 4) {
   307     __m128 xmm0;
   308     xmm0 = _mm_loadu_ps(src1);
   309     xmm0 = _mm_add_ps(xmm0, xmm1);
   310     _mm_store_ps(dest, xmm0);
   311     dest += 4;
   312     src1 += 4;
   313   }
   314   for (; n > 0; n--) {
   315     *dest++ = *src1++ + *val;
   316   }
   317 }
   318 OIL_DEFINE_IMPL_FULL (scalaradd_f32_ns_sse, scalaradd_f32_ns, OIL_IMPL_FLAG_SSE);
   319 
   320 SSE_FUNCTION static void
   321 scalarmultiply_f32_ns_sse (float *dest, float *src1, float *val, int n)
   322 {
   323   __m128 xmm1;
   324 
   325   /* Initial operations to align the destination pointer */
   326   for (; ((long)dest & 15) && (n > 0); n--) {
   327     *dest++ = *src1++ * *val;
   328   }
   329   xmm1 = _mm_load_ps1(val);
   330   for (; n >= 4; n -= 4) {
   331     __m128 xmm0;
   332     xmm0 = _mm_loadu_ps(src1);
   333     xmm0 = _mm_mul_ps(xmm0, xmm1);
   334     _mm_store_ps(dest, xmm0);
   335     dest += 4;
   336     src1 += 4;
   337   }
   338   for (; n > 0; n--) {
   339     *dest++ = *src1++ * *val;
   340   }
   341 }
   342 OIL_DEFINE_IMPL_FULL (scalarmultiply_f32_ns_sse, scalarmultiply_f32_ns, OIL_IMPL_FLAG_SSE);
   343 
   344 SSE_FUNCTION static void
   345 scalarmultiply_f64_ns_sse2 (double *dest, double *src1, double *val, int n)
   346 {
   347   __m128d xmm1;
   348 
   349   /* Initial operations to align the destination pointer */
   350   for (; ((long)dest & 15) && (n > 0); n--) {
   351     *dest++ = *src1++ * *val;
   352   }
   353   xmm1 = _mm_load_pd1(val);
   354   for (; n >= 2; n -= 2) {
   355     __m128d xmm0;
   356     xmm0 = _mm_loadu_pd(src1);
   357     xmm0 = _mm_mul_pd(xmm0, xmm1);
   358     _mm_store_pd(dest, xmm0);
   359     dest += 2;
   360     src1 += 2;
   361   }
   362   for (; n > 0; n--) {
   363     *dest++ = *src1++ * *val;
   364   }
   365 }
   366 OIL_DEFINE_IMPL_FULL (scalarmultiply_f64_ns_sse2, scalarmultiply_f64_ns, OIL_IMPL_FLAG_SSE2);
   367 
   368 
   369 
   370 #ifdef	__SYMBIAN32__
   371  
   372 OilFunctionImpl* __oil_function_impl_add_f32_sse, add_f32() {
   373 		return &_oil_function_impl_add_f32_sse, add_f32;
   374 }
   375 #endif
   376 
   377 #ifdef	__SYMBIAN32__
   378  
   379 OilFunctionImpl* __oil_function_impl_add_f64_sse2, add_f64() {
   380 		return &_oil_function_impl_add_f64_sse2, add_f64;
   381 }
   382 #endif
   383 
   384 #ifdef	__SYMBIAN32__
   385  
   386 OilFunctionImpl* __oil_function_impl_add_f64_sse2_unroll, add_f64() {
   387 		return &_oil_function_impl_add_f64_sse2_unroll, add_f64;
   388 }
   389 #endif
   390 
   391 #ifdef	__SYMBIAN32__
   392  
   393 OilFunctionImpl* __oil_function_impl_subtract_f32_sse, subtract_f32() {
   394 		return &_oil_function_impl_subtract_f32_sse, subtract_f32;
   395 }
   396 #endif
   397 
   398 #ifdef	__SYMBIAN32__
   399  
   400 OilFunctionImpl* __oil_function_impl_multiply_f32_sse, multiply_f32() {
   401 		return &_oil_function_impl_multiply_f32_sse, multiply_f32;
   402 }
   403 #endif
   404 
   405 #ifdef	__SYMBIAN32__
   406  
   407 OilFunctionImpl* __oil_function_impl_divide_f32_sse, divide_f32() {
   408 		return &_oil_function_impl_divide_f32_sse, divide_f32;
   409 }
   410 #endif
   411 
   412 #ifdef	__SYMBIAN32__
   413  
   414 OilFunctionImpl* __oil_function_impl_minimum_f32_sse, minimum_f32() {
   415 		return &_oil_function_impl_minimum_f32_sse, minimum_f32;
   416 }
   417 #endif
   418 
   419 #ifdef	__SYMBIAN32__
   420  
   421 OilFunctionImpl* __oil_function_impl_maximum_f32_sse, maximum_f32() {
   422 		return &_oil_function_impl_maximum_f32_sse, maximum_f32;
   423 }
   424 #endif
   425 
   426 #ifdef	__SYMBIAN32__
   427  
   428 OilFunctionImpl* __oil_function_impl_inverse_f32_sse, inverse_f32() {
   429 		return &_oil_function_impl_inverse_f32_sse, inverse_f32;
   430 }
   431 #endif
   432 
   433 #ifdef	__SYMBIAN32__
   434  
   435 OilFunctionImpl* __oil_function_impl_negative_f32_sse, negative_f32() {
   436 		return &_oil_function_impl_negative_f32_sse, negative_f32;
   437 }
   438 #endif
   439 
   440 #ifdef	__SYMBIAN32__
   441  
   442 OilFunctionImpl* __oil_function_impl_scalaradd_f32_ns_sse, scalaradd_f32_ns() {
   443 		return &_oil_function_impl_scalaradd_f32_ns_sse, scalaradd_f32_ns;
   444 }
   445 #endif
   446 
   447 #ifdef	__SYMBIAN32__
   448  
   449 OilFunctionImpl* __oil_function_impl_scalarmultiply_f32_ns_sse, scalarmultiply_f32_ns() {
   450 		return &_oil_function_impl_scalarmultiply_f32_ns_sse, scalarmultiply_f32_ns;
   451 }
   452 #endif
   453 
   454 #ifdef	__SYMBIAN32__
   455  
   456 OilFunctionImpl* __oil_function_impl_scalarmultiply_f64_ns_sse2, scalarmultiply_f64_ns() {
   457 		return &_oil_function_impl_scalarmultiply_f64_ns_sse2, scalarmultiply_f64_ns;
   458 }
   459 #endif
   460